|
import gradio as gr
import numpy as np
import pandas as pd
import sentencepiece
import torch
from transformers import BertJapaneseTokenizer, BertModel

USER_NAME = "user"
ASSISTANT_NAME = "assistant"

# Japanese BERT used to embed sentences.
MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

# Q&A dictionary loaded from CSV (see the assumed layout below).
model_df = pd.read_csv('dict.csv')
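# dict.csv is not included here; it is assumed to provide at least
# "question" and "answer" columns (those names are used further down).
# Illustrative layout only:
#
#   question,answer
#   営業時間を教えてください,営業時間は9時から18時までです
#   定休日はいつですか,定休日は毎週水曜日です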
|
|
|
def sentence_to_vector(model, tokenizer, sentence):
    # Tokenize the sentence, run it through BERT, and mean-pool the last
    # hidden state into a single fixed-length sentence vector.
    tokens = tokenizer(sentence)["input_ids"]
    input_ids = torch.tensor(tokens).reshape(1, -1)

    with torch.no_grad():
        outputs = model(input_ids, output_hidden_states=True)

    last_hidden_state = outputs.last_hidden_state[0]
    averaged_hidden_state = last_hidden_state.mean(dim=0)
    return averaged_hidden_state
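# Quick sanity check (illustrative, not part of the app): bert-base models
# have a hidden size of 768, so the returned vector is 768-dimensional.
#   vec = sentence_to_vector(model, tokenizer, "こんにちは")
#   print(vec.shape)  # torch.Size([768])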
|
|
|
def cosine_similarity(x1, x2, eps):
    # Cosine similarity of two 1-D tensors; eps keeps the denominator away
    # from zero for (near-)zero vectors.
    w12 = torch.sum(x1 * x2)
    w1 = torch.sum(x1 * x1)
    w2 = torch.sum(x2 * x2)
    n12 = (w1 * w2).clamp_min_(eps * eps).sqrt_()
    score = w12 / n12
    return score.item()
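# For reference, torch.nn.functional.cosine_similarity(x1, x2, dim=0)
# computes essentially the same quantity with PyTorch's built-in kernel.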
|
|
|
def calc_similarity(sentence1, sentence2):
    # Embed both sentences and compare them by cosine similarity.
    sentence_vector1 = sentence_to_vector(model, tokenizer, sentence1)
    sentence_vector2 = sentence_to_vector(model, tokenizer, sentence2)
    score = cosine_similarity(sentence_vector1, sentence_vector2, 1e-8)
    return score
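# Illustrative usage (the sentences are made up for the example):
#   calc_similarity("営業時間は何時ですか", "何時まで開いていますか")
# The result is a cosine similarity: higher values mean BERT embeds the two
# sentences more similarly.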
|
|
|
def chat(user_msg):
    # Compare the user's message with every known question and return the
    # answer of the closest match together with its similarity score.
    sentence1 = user_msg
    similar_value = 0
    similar_word = ""
    for i in range(len(model_df)):
        sentence2 = model_df["question"][i]
        value = calc_similarity(sentence1, sentence2)
        if value > similar_value:
            similar_value = value
            similar_word = model_df["answer"][i]
    return similar_word, similar_value
|
|
|
# Gradio UI: one text input, two outputs (the matched answer and its score).
iface = gr.Interface(fn=chat, inputs="text", outputs=["text", "number"])
iface.launch()
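# Gradio can also expose a temporary public URL via iface.launch(share=True).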