# chatbot/app0.py
import gradio as gr
import pandas as pd
import torch
from transformers import BertJapaneseTokenizer, BertModel

USER_NAME = "user"
ASSISTANT_NAME = "assistant"
# Japanese BERT used to embed sentences
MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

# Q&A pairs; dict.csv is expected to provide "question" and "answer" columns
model_df = pd.read_csv('dict.csv')

def sentence_to_vector(model, tokenizer, sentence):
    # Split the sentence into tokens, map them to vocabulary ids, and
    # return them as a (1, seq_len) tensor ready for the BERT model
    input_ids = tokenizer(sentence, return_tensors="pt")["input_ids"]
    # Feed the sentence through BERT and average the last hidden states
    # over all tokens to obtain a single sentence vector
    with torch.no_grad():
        outputs = model(input_ids)
        last_hidden_state = outputs.last_hidden_state[0]
    return last_hidden_state.mean(dim=0)
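
# Note: with a single unpadded sentence the simple mean above is fine; if
# sentences were ever batched with padding, a masked mean over the tokenizer's
# attention_mask would be needed instead, e.g. (sketch, `enc` hypothetical):
#   enc = tokenizer(sentences, return_tensors="pt", padding=True)
#   mask = enc["attention_mask"][0].unsqueeze(-1)
#   vec = (last_hidden_state * mask).sum(dim=0) / mask.sum()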

def cosine_similarity(x1, x2, eps):  # dim argument omitted for simplicity
    w12 = torch.sum(x1 * x2)
    w1 = torch.sum(x1 * x1)
    w2 = torch.sum(x2 * x2)
    n12 = (w1 * w2).clamp_min(eps * eps).sqrt()
    return (w12 / n12).item()
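
# For 1-D vectors the hand-rolled function above computes the same value as
# torch's built-in (up to minor eps-handling differences across versions):
#   torch.nn.functional.cosine_similarity(x1, x2, dim=0, eps=1e-8)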

def calc_similarity(sentence1, sentence2):
    sentence_vector1 = sentence_to_vector(model, tokenizer, sentence1)
    sentence_vector2 = sentence_to_vector(model, tokenizer, sentence2)
    return cosine_similarity(sentence_vector1, sentence_vector2, 1e-8)

def chat(user_msg):
    # Find the stored question most similar to the user's message and
    # return its paired answer together with the similarity score
    similar_value = 0.0
    similar_word = ""
    for i in range(len(model_df)):
        value = calc_similarity(user_msg, model_df["question"][i])
        if value > similar_value:
            similar_value = value
            similar_word = model_df["answer"][i]
    return similar_word, similar_value
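
# Possible optimization (a sketch, not part of the original flow): embedding
# every question once at startup would avoid re-running BERT over the whole
# CSV on each request; chat() would then only embed the user message:
#   question_vectors = [sentence_to_vector(model, tokenizer, q)
#                       for q in model_df["question"]]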
iface = gr.Interface(fn=chat, inputs="text", outputs=["text","number"])
iface.launch()