Squashed commit of the following:
commit d2752e15374f35abd5db689869e92d1f63f79616
Author: Mya-Mya <39019907+Mya-Mya@users.noreply.github.com>
Date: Sat Aug 13 23:46:15 2022 +0900
Create apps
commit 0093e017053f317dc126f7e743c199d02c4f6771
Author: Mya-Mya <39019907+Mya-Mya@users.noreply.github.com>
Date: Sat Aug 13 23:45:58 2022 +0900
Update frontend.py
commit 53c9b7fadb24ee0b482327507dc03fa41f1360b5
Author: Mya-Mya <39019907+Mya-Mya@users.noreply.github.com>
Date: Sat Aug 13 23:45:56 2022 +0900
Delete backend.py
commit f0f53b78d0caaeed81a8f97c754c74e352fde63e
Author: Mya-Mya <39019907+Mya-Mya@users.noreply.github.com>
Date: Sat Aug 13 23:45:46 2022 +0900
Create ambiguous_search_backends.py
commit ab080f60db316d536cb913da1fbf0fb4253e4b35
Author: Mya-Mya <39019907+Mya-Mya@users.noreply.github.com>
Date: Sat Aug 13 23:45:43 2022 +0900
Create classic_search_backends.py
commit c414d550d4199032c3102b8821336cafe6b9722e
Author: Mya-Mya <39019907+Mya-Mya@users.noreply.github.com>
Date: Sat Aug 13 23:45:34 2022 +0900
Update himitsudogu_db.pkl
commit e4de043bfad19382a10f84e3e240c06245471dc0
Author: Mya-Mya <39019907+Mya-Mya@users.noreply.github.com>
Date: Sat Aug 13 23:45:28 2022 +0900
Create 1.pkl
commit 4db15a6d5e4ba8cc533c904f9c39378493b979be
Author: Mya-Mya <39019907+Mya-Mya@users.noreply.github.com>
Date: Sat Aug 13 23:45:23 2022 +0900
Delete localtest.py
- ambiguous_search_backends.py +60 -0
- app.py +6 -46
- backend.py +0 -6
- classic_search_backends.py +32 -0
- frontend.py +88 -14
- himitsudogu_db.pkl +2 -2
- localtest.py +0 -35
- nobert_app.py +8 -0
- old_himitsudogu_db/1.pkl +3 -0
ambiguous_search_backends.py
@@ -0,0 +1,60 @@
+from abc import ABC, abstractmethod
+from pandas import DataFrame
+from transformers import BertJapaneseTokenizer, BertModel
+import pickle
+import numpy as np
+
+
+class AmbiguousSearchBackend(ABC):
+    @abstractmethod
+    def submit(self, query: str) -> DataFrame:
+        pass
+
+
+class DummyAmbiguousSearchBackend(AmbiguousSearchBackend):
+    def submit(self, query: str) -> DataFrame:
+        return DataFrame(
+            {
+                "類似度": [1, 0.9, 0.8, 0.7],
+                "名前": ["A", "B", "C", "D"],
+                "説明": ["a", "b", "c", "d"],
+            }
+        )
+
+
+class SBAmbiguousSearchBackend(AmbiguousSearchBackend):
+    def __init__(self):
+        super().__init__()
+        with open("./himitsudogu_db.pkl", "rb") as file:
+            self.himitsudogu_db: dict = pickle.load(file)
+        self.feature_matrix = self.himitsudogu_db["feature_matrix_s"][
+            "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
+        ]
+        # Use the model sonoisa/sentence-bert-base-ja-mean-tokens-v2
+        self.tokenizer = BertJapaneseTokenizer.from_pretrained(
+            "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
+        )
+        self.model = BertModel.from_pretrained(
+            "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
+        )
+    def submit(self, query: str) -> DataFrame:
+        # Morphologically analyze the sentence into a sequence of token IDs
+        tokenized = self.tokenizer(query, return_tensors="pt")
+        # Feed the token ID sequence into the language model
+        output = self.model(**tokenized)
+        # Get the sentence's feature vector
+        pooler_output = output["pooler_output"]
+        query_feature_vector = pooler_output[0].detach().numpy()
+        query_feature_unit_vector = query_feature_vector / np.linalg.norm(query_feature_vector)
+        # Take the dot product with each gadget description's feature vector
+        cs_s = self.feature_matrix @ query_feature_unit_vector
+        # Display gadgets in order of decreasing dot product
+        ranked_index_s = np.argsort(cs_s)[::-1]
+        output = DataFrame(columns=["類似度", "名前", "説明"])
+        for rank, i in enumerate(ranked_index_s[:20], 1):
+            output.loc[rank] = [
+                cs_s[i],
+                self.himitsudogu_db["name_s"][i],
+                self.himitsudogu_db["description_s"][i],
+            ]
+        return output
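Why submit() normalizes the query vector: if the rows of feature_matrix are unit-length description vectors, the matrix-vector product with a unit-length query vector yields cosine similarities, which is what the 類似度 (similarity) column reports. A minimal sketch with hypothetical random data standing in for the vectors stored in himitsudogu_db.pkl, assuming the stored rows are already L2-normalized:

import numpy as np

# Hypothetical stand-in for himitsudogu_db["feature_matrix_s"][...]:
# one row per gadget description, each row scaled to unit length.
feature_matrix = np.random.randn(5, 768)
feature_matrix /= np.linalg.norm(feature_matrix, axis=1, keepdims=True)

# Normalize the query vector the same way submit() does, so the
# dot products below are cosine similarities in [-1, 1].
query_feature_vector = np.random.randn(768)
query_feature_unit_vector = query_feature_vector / np.linalg.norm(query_feature_vector)

cs_s = feature_matrix @ query_feature_unit_vector  # one similarity per gadget
ranked_index_s = np.argsort(cs_s)[::-1]            # best match first
print(ranked_index_s)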
app.py
@@ -1,48 +1,8 @@
-from backend import Backend
+from ambiguous_search_backends import SBAmbiguousSearchBackend
+from classic_search_backends import ImplClassicSearchBackend
 import frontend
-import numpy as np
-from pandas import DataFrame
-from transformers import BertJapaneseTokenizer, BertModel
-import pickle
 
-
-
-
-
-class HFBackend(Backend):
-    def __init__(self):
-        super().__init__()
-        self.feature_matrix = himitsudogu_db["feature_matrix_s"][
-            "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
-        ]
-        # Use the model sonoisa/sentence-bert-base-ja-mean-tokens-v2
-        self.tokenizer = BertJapaneseTokenizer.from_pretrained(
-            "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
-        )
-        self.model = BertModel.from_pretrained(
-            "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
-        )
-
-    def on_submit_button_press(self, query: str) -> DataFrame:
-        # Morphologically analyze the sentence into a sequence of token IDs
-        tokenized = self.tokenizer(query, return_tensors="pt")
-        # Feed the token ID sequence into the language model
-        output = self.model(**tokenized)
-        # Get the sentence's feature vector
-        pooler_output = output["pooler_output"]
-        query_feature_vector = pooler_output[0].detach().numpy()
-        # Take the dot product with each gadget description's feature vector
-        cs_s = self.feature_matrix @ query_feature_vector
-        # Display gadgets in order of decreasing dot product
-        ranked_index_s = np.argsort(cs_s)[::-1]
-        output = DataFrame(columns=["類似度", "名前", "説明"])
-        for rank, i in enumerate(ranked_index_s[:20], 1):
-            output.loc[rank] = [
-                cs_s[i],
-                himitsudogu_db["name_s"][i],
-                himitsudogu_db["description_s"][i],
-            ]
-        return output
+frontend.launch_frontend(
+    ambiguous_search_backend=SBAmbiguousSearchBackend(),
+    classic_search_backend=ImplClassicSearchBackend(),
+)
backend.py
@@ -1,6 +0,0 @@
-from abc import ABC,abstractmethod
-from pandas import DataFrame
-class Backend(ABC):
-    @abstractmethod
-    def on_submit_button_press(self,query:str)->DataFrame:
-        pass
classic_search_backends.py
@@ -0,0 +1,32 @@
+from abc import ABC, abstractmethod
+from pandas import DataFrame
+import re
+import pickle
+
+class ClassicSearchBackend(ABC):
+    @abstractmethod
+    def submit(self, query:str)->DataFrame:
+        pass
+    @abstractmethod
+    def get_num_items(self)->int:
+        pass
+
+class ImplClassicSearchBackend(ClassicSearchBackend):
+    def __init__(self):
+        super().__init__()
+        with open("./himitsudogu_db.pkl", "rb") as file:
+            self.himitsudogu_db: dict = pickle.load(file)
+    def submit(self, query: str) -> DataFrame:
+        pattern = ".*"+query+".*"
+        output = DataFrame(columns=["名前","説明"])
+        for i,(name,description) in enumerate(zip(
+            self.himitsudogu_db["name_s"],self.himitsudogu_db["description_s"]
+        )):
+            if re.match(pattern,name+description):
+                output.loc[i] = [
+                    name,
+                    description
+                ]
+        return output
+    def get_num_items(self)->int:
+        return len(self.himitsudogu_db["name_s"])
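A note on the matching in ImplClassicSearchBackend.submit(): re.match(".*" + query + ".*", text) behaves as an unanchored substring test, but the query is interpolated unescaped, so regex metacharacters typed by a user are interpreted and can raise re.error. A minimal sketch of a literal-substring equivalent (the helper name is hypothetical):

import re

def contains_query(query: str, text: str) -> bool:
    # re.escape makes the user's query match literally; this is
    # equivalent to the plain-Python test `query in text`.
    return re.search(re.escape(query), text) is not None

print(contains_query("どこでもドア", "どこでもドア どこへでも行ける"))  # True
print(contains_query("(", "no parentheses here"))  # False; unescaped "(" would raise re.error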
frontend.py
@@ -1,15 +1,89 @@
-from gradio import
-
-
+from gradio import (
+    update,
+    Blocks,
+    Row,
+    Examples,
+    Group,
+    Markdown,
+    Textbox,
+    Button,
+    Tabs,
+    TabItem,
+    DataFrame as GRDataFrame,
+)
+from ambiguous_search_backends import AmbiguousSearchBackend
+from classic_search_backends import ClassicSearchBackend
+
+
+def launch_frontend(
+    ambiguous_search_backend: AmbiguousSearchBackend,
+    classic_search_backend: ClassicSearchBackend,
+):
+    def on_ambiguous_search_click(query: str):
+        result = ambiguous_search_backend.submit(query)
+        return GRDataFrame.update(value=result, visible=True)
+
+    def on_ambiguous_search_query_clear_click():
+        return Textbox.update(value="")
+
+    def on_classic_search_click(query: str):
+        result = classic_search_backend.submit(query)
+        return GRDataFrame.update(value=result, visible=True)
+
     with Blocks() as frontend:
-        Markdown(""
-
-
-
-
-
-
-
-
-
+        Markdown("## Himitsudogu Search")
+        with Tabs():
+            with TabItem("あいまい検索"):
+                Markdown(
+                    """
+                    自然言語処理モデル"Sentence BERT"を使って、ドラえもんのひみつ道具をあいまい検索。
+                    """
+                )
+                query_textbox = Textbox(
+                    label="自分の言葉でひみつ道具を説明してください", max_lines=1
+                )
+                with Row():
+                    submit_button = Button("検索", variant="primary")
+                    clear_button = Button("クリア")
+                result_table = GRDataFrame(visible=False)
+                Examples(
+                    examples=[
+                        "ふりかけると水を色々な性質にできる",
+                        "小さいカメラが飛ぶ",
+                        "壁を通り抜けられるようにする輪",
+                        "地図をいじって実際の町の建物などの位置を変える",
+                        "歌声を綺麗にする",
+                    ],
+                    inputs=query_textbox,
+                )
+
+                submit_button.click(
+                    on_ambiguous_search_click,
+                    inputs=query_textbox,
+                    outputs=result_table,
+                )
+                clear_button.click(
+                    on_ambiguous_search_query_clear_click,
+                    inputs=None,
+                    outputs=query_textbox,
+                )
+
+            with TabItem("古典検索"):
+                Markdown(
+                    f"""
+                    キーワードから検索。
+                    検索可能なひみつ道具は全{classic_search_backend.get_num_items()}種類。
+                    """
+                )
+                query_textbox = Textbox(label="キーワード", max_lines=1)
+                with Row():
+                    submit_button = Button("検索", variant="primary")
+                result_table = GRDataFrame(visible=False)
+
+                submit_button.click(
+                    on_classic_search_click,
+                    inputs=query_textbox,
+                    outputs=result_table,
+                )
+
+    frontend.launch()
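On the callback pattern in frontend.py: in Gradio 3.x a callback may return a component update (here GRDataFrame.update(value=result, visible=True)) instead of a plain value, so the hidden result table is filled and revealed in a single step. A minimal standalone sketch of the same pattern, assuming a Gradio 3.x environment and using the generic gr.update helper:

import gradio as gr

def show_result():
    # Return an update: set the table's value and make it visible.
    return gr.update(value=[[1, 2], [3, 4]], visible=True)

with gr.Blocks() as demo:
    button = gr.Button("検索")
    table = gr.DataFrame(visible=False)
    # The update returned by show_result is applied to `table`.
    button.click(show_result, inputs=None, outputs=table)

demo.launch()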
himitsudogu_db.pkl
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:f87a588b29a0dfcfd4eb53710cec37613dd5d873c13772c6cb7b570ae5264a65
+size 4203008
localtest.py
@@ -1,35 +0,0 @@
-from backend import Backend
-import frontend
-import numpy as np
-from pandas import DataFrame
-import pickle
-
-with open("./himitsudogu_db.pkl", "rb") as file:
-    himitsudogu_db: dict = pickle.load(file)
-
-class LocaltestBackend(Backend):
-    """
-    Since Sentence BERT cannot be brought into the local development
-    environment, the sentence-vectorization step uses random numbers instead.
-    """
-
-    def __init__(self):
-        super().__init__()
-        self.feature_matrix = himitsudogu_db["feature_matrix_s"][
-            "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
-        ]
-
-    def on_submit_button_press(self, query: str) -> DataFrame:
-        query_feature_vector = np.random.random((768,))
-        cs_s = self.feature_matrix @ query_feature_vector
-        ranked_index_s = np.argsort(cs_s)[::-1]
-        output = DataFrame(columns=["類似度", "名前", "説明"])
-        for rank, i in enumerate(ranked_index_s[:20], 1):
-            output.loc[rank] = [
-                cs_s[i],
-                himitsudogu_db["name_s"][i],
-                himitsudogu_db["description_s"][i],
-            ]
-        return output
-
-frontend.launch_frontend(backend=LocaltestBackend())
nobert_app.py
@@ -0,0 +1,8 @@
+from ambiguous_search_backends import DummyAmbiguousSearchBackend
+from classic_search_backends import ImplClassicSearchBackend
+import frontend
+
+frontend.launch_frontend(
+    ambiguous_search_backend=DummyAmbiguousSearchBackend(),
+    classic_search_backend=ImplClassicSearchBackend(),
+)
old_himitsudogu_db/1.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0cb7f694ed31065ed801d932082a802042a3a224662c7cb45f01651d5ba43ccc
+size 20793904