Mya-Mya committed
Commit 9f1a6f8
1 Parent(s): e9b49df

Squashed commit of the following:

commit d2752e15374f35abd5db689869e92d1f63f79616
Author: Mya-Mya <39019907+Mya-Mya@users.noreply.github.com>
Date: Sat Aug 13 23:46:15 2022 +0900

Create apps

commit 0093e017053f317dc126f7e743c199d02c4f6771
Author: Mya-Mya <39019907+Mya-Mya@users.noreply.github.com>
Date: Sat Aug 13 23:45:58 2022 +0900

Update frontend.py

commit 53c9b7fadb24ee0b482327507dc03fa41f1360b5
Author: Mya-Mya <39019907+Mya-Mya@users.noreply.github.com>
Date: Sat Aug 13 23:45:56 2022 +0900

Delete backend.py

commit f0f53b78d0caaeed81a8f97c754c74e352fde63e
Author: Mya-Mya <39019907+Mya-Mya@users.noreply.github.com>
Date: Sat Aug 13 23:45:46 2022 +0900

Create ambiguous_search_backends.py

commit ab080f60db316d536cb913da1fbf0fb4253e4b35
Author: Mya-Mya <39019907+Mya-Mya@users.noreply.github.com>
Date: Sat Aug 13 23:45:43 2022 +0900

Create classic_search_backends.py

commit c414d550d4199032c3102b8821336cafe6b9722e
Author: Mya-Mya <39019907+Mya-Mya@users.noreply.github.com>
Date: Sat Aug 13 23:45:34 2022 +0900

Update himitsudogu_db.pkl

commit e4de043bfad19382a10f84e3e240c06245471dc0
Author: Mya-Mya <39019907+Mya-Mya@users.noreply.github.com>
Date: Sat Aug 13 23:45:28 2022 +0900

Create 1.pkl

commit 4db15a6d5e4ba8cc533c904f9c39378493b979be
Author: Mya-Mya <39019907+Mya-Mya@users.noreply.github.com>
Date: Sat Aug 13 23:45:23 2022 +0900

Delete localtest.py

ambiguous_search_backends.py ADDED
@@ -0,0 +1,60 @@
+ from abc import ABC, abstractmethod
+ from pandas import DataFrame
+ from transformers import BertJapaneseTokenizer, BertModel
+ import pickle
+ import numpy as np
+
+
+ class AmbiguousSearchBackend(ABC):
+     @abstractmethod
+     def submit(self, query: str) -> DataFrame:
+         pass
+
+
+ class DummyAmbiguousSearchBackend(AmbiguousSearchBackend):
+     def submit(self, query: str) -> DataFrame:
+         return DataFrame(
+             {
+                 "類似度": [1, 0.9, 0.8, 0.7],
+                 "名前": ["A", "B", "C", "D"],
+                 "説明": ["a", "b", "c", "d"],
+             }
+         )
+
+
+ class SBAmbiguousSearchBackend(AmbiguousSearchBackend):
+     def __init__(self):
+         super().__init__()
+         with open("./himitsudogu_db.pkl", "rb") as file:
+             self.himitsudogu_db: dict = pickle.load(file)
+         self.feature_matrix = self.himitsudogu_db["feature_matrix_s"][
+             "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
+         ]
+         # Use the model sonoisa/sentence-bert-base-ja-mean-tokens-v2
+         self.tokenizer = BertJapaneseTokenizer.from_pretrained(
+             "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
+         )
+         self.model = BertModel.from_pretrained(
+             "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
+         )
+     def submit(self, query: str) -> DataFrame:
+         # Morphologically analyze the sentence and convert it to token IDs
+         tokenized = self.tokenizer(query, return_tensors="pt")
+         # Feed the token ID sequence into the language model
+         output = self.model(**tokenized)
+         # Get the sentence's feature vector
+         pooler_output = output["pooler_output"]
+         query_feature_vector = pooler_output[0].detach().numpy()
+         query_feature_unit_vector = query_feature_vector / np.linalg.norm(query_feature_vector)
+         # Take the inner product with each gadget description's feature vector
+         cs_s = self.feature_matrix @ query_feature_unit_vector
+         # Display gadgets in descending order of inner product
+         ranked_index_s = np.argsort(cs_s)[::-1]
+         output = DataFrame(columns=["類似度", "名前", "説明"])
+         for rank, i in enumerate(ranked_index_s[:20], 1):
+             output.loc[rank] = [
+                 cs_s[i],
+                 self.himitsudogu_db["name_s"][i],
+                 self.himitsudogu_db["description_s"][i],
+             ]
+         return output
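
The ranking step in SBAmbiguousSearchBackend.submit is a cosine-similarity search: the query embedding is scaled to unit length, so its dot product with each row of feature_matrix is the cosine similarity, on the assumption (not visible in this diff) that the rows stored in himitsudogu_db.pkl were unit-normalized when the database was built. A minimal, self-contained sketch of just that ranking step, with made-up data in place of the real embeddings:

    import numpy as np

    # Hypothetical stand-in for the precomputed description embeddings:
    # 4 items with 3-dimensional features, rows normalized to unit length.
    rng = np.random.default_rng(0)
    feature_matrix = rng.random((4, 3))
    feature_matrix /= np.linalg.norm(feature_matrix, axis=1, keepdims=True)

    # Hypothetical query embedding, normalized as in submit() above.
    query_feature_vector = rng.random(3)
    query_unit = query_feature_vector / np.linalg.norm(query_feature_vector)

    # Dot products of unit vectors are cosine similarities; a descending
    # argsort puts the best-matching items first, as with cs_s/ranked_index_s.
    cs_s = feature_matrix @ query_unit
    ranked_index_s = np.argsort(cs_s)[::-1]
    print(ranked_index_s, cs_s[ranked_index_s])
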
app.py CHANGED
@@ -1,48 +1,8 @@
- from backend import Backend
+ from ambiguous_search_backends import SBAmbiguousSearchBackend
+ from classic_search_backends import ImplClassicSearchBackend
  import frontend
- import numpy as np
- from pandas import DataFrame
- from transformers import BertJapaneseTokenizer, BertModel
- import pickle

- with open("./himitsudogu_db.pkl", "rb") as file:
-     himitsudogu_db: dict = pickle.load(file)
-
-
- class HFBackend(Backend):
-     def __init__(self):
-         super().__init__()
-         self.feature_matrix = himitsudogu_db["feature_matrix_s"][
-             "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
-         ]
-         # Use the model sonoisa/sentence-bert-base-ja-mean-tokens-v2
-         self.tokenizer = BertJapaneseTokenizer.from_pretrained(
-             "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
-         )
-         self.model = BertModel.from_pretrained(
-             "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
-         )
-
-     def on_submit_button_press(self, query: str) -> DataFrame:
-         # Morphologically analyze the sentence and convert it to token IDs
-         tokenized = self.tokenizer(query, return_tensors="pt")
-         # Feed the token ID sequence into the language model
-         output = self.model(**tokenized)
-         # Get the sentence's feature vector
-         pooler_output = output["pooler_output"]
-         query_feature_vector = pooler_output[0].detach().numpy()
-         # Take the inner product with each gadget description's feature vector
-         cs_s = self.feature_matrix @ query_feature_vector
-         # Display gadgets in descending order of inner product
-         ranked_index_s = np.argsort(cs_s)[::-1]
-         output = DataFrame(columns=["類似度", "名前", "説明"])
-         for rank, i in enumerate(ranked_index_s[:20], 1):
-             output.loc[rank] = [
-                 cs_s[i],
-                 himitsudogu_db["name_s"][i],
-                 himitsudogu_db["description_s"][i],
-             ]
-         return output
-
-
- frontend.launch_frontend(backend=HFBackend())
+ frontend.launch_frontend(
+     ambiguous_search_backend=SBAmbiguousSearchBackend(),
+     classic_search_backend=ImplClassicSearchBackend(),
+ )

backend.py DELETED
@@ -1,6 +0,0 @@
- from abc import ABC, abstractmethod
- from pandas import DataFrame
- class Backend(ABC):
-     @abstractmethod
-     def on_submit_button_press(self, query: str) -> DataFrame:
-         pass

classic_search_backends.py ADDED
@@ -0,0 +1,32 @@
+ from abc import ABC, abstractmethod
+ from pandas import DataFrame
+ import re
+ import pickle
+
+ class ClassicSearchBackend(ABC):
+     @abstractmethod
+     def submit(self, query: str) -> DataFrame:
+         pass
+     @abstractmethod
+     def get_num_items(self) -> int:
+         pass
+
+ class ImplClassicSearchBackend(ClassicSearchBackend):
+     def __init__(self):
+         super().__init__()
+         with open("./himitsudogu_db.pkl", "rb") as file:
+             self.himitsudogu_db: dict = pickle.load(file)
+     def submit(self, query: str) -> DataFrame:
+         pattern = ".*" + query + ".*"
+         output = DataFrame(columns=["名前", "説明"])
+         for i, (name, description) in enumerate(zip(
+             self.himitsudogu_db["name_s"], self.himitsudogu_db["description_s"]
+         )):
+             if re.match(pattern, name + description):
+                 output.loc[i] = [
+                     name,
+                     description,
+                 ]
+         return output
+     def get_num_items(self) -> int:
+         return len(self.himitsudogu_db["name_s"])
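
One caveat in ImplClassicSearchBackend.submit: the pattern ".*" + query + ".*" is handed to re.match, so the user's keyword is interpreted as a regular expression. A keyword such as "(" raises re.error, and "a+b" silently fails to match the literal text "a+b". A hedged sketch of a literal substring test — a possible hardening, not what the committed code does:

    import re

    def contains_literally(text: str, query: str) -> bool:
        # re.escape neutralizes regex metacharacters in the keyword, and
        # re.search scans the whole string, so no ".*" padding is needed.
        return re.search(re.escape(query), text) is not None

    print(contains_literally("xx a+b yy", "a+b"))        # True
    print(re.match(".*a+b.*", "xx a+b yy") is not None)  # False: "+" acts as a quantifier
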
frontend.py CHANGED
@@ -1,15 +1,89 @@
- from gradio import Blocks,Row,Group,Markdown,Textbox,Button,DataFrame as GRDataFrame
- from backend import Backend
- def launch_frontend(backend:Backend):
+ from gradio import (
+     update,
+     Blocks,
+     Row,
+     Examples,
+     Group,
+     Markdown,
+     Textbox,
+     Button,
+     Tabs,
+     TabItem,
+     DataFrame as GRDataFrame,
+ )
+ from ambiguous_search_backends import AmbiguousSearchBackend
+ from classic_search_backends import ClassicSearchBackend
+
+
+ def launch_frontend(
+     ambiguous_search_backend: AmbiguousSearchBackend,
+     classic_search_backend: ClassicSearchBackend,
+ ):
+     def on_ambiguous_search_click(query: str):
+         result = ambiguous_search_backend.submit(query)
+         return GRDataFrame.update(value=result, visible=True)
+
+     def on_ambiguous_search_query_clear_click():
+         return Textbox.update(value="")
+
+     def on_classic_search_click(query: str):
+         result = classic_search_backend.submit(query)
+         return GRDataFrame.update(value=result, visible=True)
+
      with Blocks() as frontend:
-         Markdown("""
-         自然言語処理モデル"Sentence BERT"を使って、ドラえもんのひみつ道具をあいまい検索。
-         自分の言葉でひみつ道具を説明してください。
-         """)
-         with Group():
-             query_textbox = Textbox(label="ひみつ道具の説明", max_lines=1)
-             submit_button = Button("検索")
-         result_table = GRDataFrame(label="検索結果")
-
-         submit_button.click(backend.on_submit_button_press, inputs=query_textbox, outputs=result_table)
-     frontend.launch()
+         Markdown("## Himitsudogu Search")
+         with Tabs():
+             with TabItem("あいまい検索"):
+                 Markdown(
+                     """
+                     自然言語処理モデル"Sentence BERT"を使って、ドラえもんのひみつ道具をあいまい検索。
+                     """
+                 )
+                 query_textbox = Textbox(
+                     label="自分の言葉でひみつ道具を説明してください", max_lines=1
+                 )
+                 with Row():
+                     submit_button = Button("検索", variant="primary")
+                     clear_button = Button("クリア")
+                 result_table = GRDataFrame(visible=False)
+                 Examples(
+                     examples=[
+                         "ふりかけると水を色々な性質にできる",
+                         "小さいカメラが飛ぶ",
+                         "壁を通り抜けられるようにする輪",
+                         "地図をいじって実際の町の建物などの位置を変える",
+                         "歌声を綺麗にする",
+                     ],
+                     inputs=query_textbox,
+                 )
+
+                 submit_button.click(
+                     on_ambiguous_search_click,
+                     inputs=query_textbox,
+                     outputs=result_table,
+                 )
+                 clear_button.click(
+                     on_ambiguous_search_query_clear_click,
+                     inputs=None,
+                     outputs=query_textbox,
+                 )
+
+             with TabItem("古典検索"):
+                 Markdown(
+                     f"""
+                     キーワードから検索。
+                     検索可能なひみつ道具は全{classic_search_backend.get_num_items()}種類。
+                     """
+                 )
+                 query_textbox = Textbox(label="キーワード", max_lines=1)
+                 with Row():
+                     submit_button = Button("検索", variant="primary")
+                 result_table = GRDataFrame(visible=False)
+
+                 submit_button.click(
+                     on_classic_search_click,
+                     inputs=query_textbox,
+                     outputs=result_table,
+                 )
+
+     frontend.launch()
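
The reveal-on-result pattern used above — construct the table with visible=False, then return GRDataFrame.update(value=..., visible=True) from the click handler — is the Gradio 3.x Blocks idiom for showing a component only once there is something to display. A minimal sketch of the same idea, assuming a Gradio 3.x environment (gr.update here is interchangeable with the Component.update classmethods in the diff; later Gradio releases dropped those classmethods in favor of gr.update):

    import gradio as gr  # assumes Gradio 3.x, matching the API in this commit

    def on_search(query: str):
        # Returning an update object sets the value and flips visibility on.
        return gr.update(value=[[query, "dummy result"]], visible=True)

    with gr.Blocks() as demo:
        box = gr.Textbox(label="query", max_lines=1)
        btn = gr.Button("search", variant="primary")
        table = gr.DataFrame(visible=False)  # hidden until the first search
        btn.click(on_search, inputs=box, outputs=table)

    # demo.launch()  # uncomment to serve locally
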
himitsudogu_db.pkl CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0cb7f694ed31065ed801d932082a802042a3a224662c7cb45f01651d5ba43ccc
- size 20793904
+ oid sha256:f87a588b29a0dfcfd4eb53710cec37613dd5d873c13772c6cb7b570ae5264a65
+ size 4203008
localtest.py DELETED
@@ -1,35 +0,0 @@
- from backend import Backend
- import frontend
- import numpy as np
- from pandas import DataFrame
- import pickle
-
- with open("./himitsudogu_db.pkl", "rb") as file:
-     himitsudogu_db: dict = pickle.load(file)
-
- class LocaltestBackend(Backend):
-     """
-     Since Sentence BERT cannot be brought into the local development environment,
-     the sentence-vectorization step is replaced with random-number generation.
-     """
-
-     def __init__(self):
-         super().__init__()
-         self.feature_matrix = himitsudogu_db["feature_matrix_s"][
-             "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
-         ]
-
-     def on_submit_button_press(self, query: str) -> DataFrame:
-         query_feature_vector = np.random.random((768,))
-         cs_s = self.feature_matrix @ query_feature_vector
-         ranked_index_s = np.argsort(cs_s)[::-1]
-         output = DataFrame(columns=["類似度", "名前", "説明"])
-         for rank, i in enumerate(ranked_index_s[:20], 1):
-             output.loc[rank] = [
-                 cs_s[i],
-                 himitsudogu_db["name_s"][i],
-                 himitsudogu_db["description_s"][i],
-             ]
-         return output
-
- frontend.launch_frontend(backend=LocaltestBackend())

nobert_app.py ADDED
@@ -0,0 +1,8 @@
+ from ambiguous_search_backends import DummyAmbiguousSearchBackend
+ from classic_search_backends import ImplClassicSearchBackend
+ import frontend
+
+ frontend.launch_frontend(
+     ambiguous_search_backend=DummyAmbiguousSearchBackend(),
+     classic_search_backend=ImplClassicSearchBackend(),
+ )
old_himitsudogu_db/1.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0cb7f694ed31065ed801d932082a802042a3a224662c7cb45f01651d5ba43ccc
+ size 20793904