makcrx committed on 2023-08-20

Commit 91856d0 • 1 Parent(s): fdc26d9

- test.py → embed_qa.py +4 -12
- faiss_qa_2023-08-20/index.faiss +3 -0
- faiss_qa_2023-08-20/index.pkl +3 -0
- test.ipynb +5 -25
test.py → embed_qa.py
RENAMED
@@ -2,21 +2,13 @@ import sqlite3, json
 from contextlib import closing
 
 # change THIS
-output_dir = 'faiss_qa_2023-08-
+output_dir = 'faiss_qa_2023-08-20'
 model_name = "multi-qa-MiniLM-L6-cos-v1"
 
 punctuation = '!"#\'(),:;?[]^`}{'
 punctuation2 = '-/&._~+*=@<>[]\\'
 remove_punctuation = str.maketrans(punctuation2, ' ' * len(punctuation2), punctuation)
 
-def add_special_questions(questions):
-    questions.append({
-        "question": "Позови человека/менеджера",
-        "query": "Позови человека/менеджера",
-        "answer": "Переключаю на сотрудника, ожидайте",
-        "articleId": 0,
-    })
-
 def load_questions(sqlite_filename):
     all_questions = []
     with closing(sqlite3.connect(sqlite_filename)) as db:
@@ -28,15 +20,15 @@ def load_questions(sqlite_filename):
         ).fetchall()
 
         for res in results:
-
+            if res['section'].lower() == 'служебные ответы':
+                res['section'] = ''
+
             questions = json.loads(res['questions'])
             for q in questions:
                 q['query'] = " ".join(res['section'].split() + res['title'].split() + q['question'].split()).translate(remove_punctuation).lower()
                 q['articleId'] = res['articleId']
             all_questions += questions
 
-    add_special_questions(all_questions)
-
     return all_questions
 
 print("Loading questions from db...")
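The normalization here builds a single translation table: characters in punctuation2 become spaces (so hyphen- and slash-joined words split into separate tokens), while characters in punctuation are deleted outright. A minimal standalone sketch of the same transformation; the table-building lines are copied from the diff, and the sample section/title/question strings are invented for illustration:

```python
# Copied from embed_qa.py: punctuation is deleted, punctuation2 maps to spaces.
# Note '[' and ']' appear in both sets; str.maketrans lets the delete set win,
# so those two are removed rather than spaced.
punctuation = '!"#\'(),:;?[]^`}{'
punctuation2 = '-/&._~+*=@<>[]\\'
remove_punctuation = str.maketrans(punctuation2, ' ' * len(punctuation2), punctuation)

# Invented sample in the shape of the section/title/question triple the loop uses:
section, title, question = "Доставка", "Сроки", "Как быстро приедет курьер (в Москве)?"
query = (" ".join(section.split() + title.split() + question.split())
         .translate(remove_punctuation)
         .lower())
print(query)  # -> 'доставка сроки как быстро приедет курьер в москве'
```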
faiss_qa_2023-08-20/index.faiss
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a8a5e3d0342187d57b9a80c431b59019c99e2ce85fdc32666a0578b958efd3b
+size 583725

faiss_qa_2023-08-20/index.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1da6f468c5ba954ba92921fa207550ba693c009ae1b6ec132d3a911e52d4f5f
+size 267292
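Both files are Git LFS pointers, not the binary index itself. An index.faiss/index.pkl pair is the on-disk layout that LangChain's FAISS vectorstore writes via save_local, which the file names suggest is what happens here; the commit does not show the writing code, though, so the sketch below is an assumption rather than the repo's actual script:

```python
# Hypothetical sketch of how an index.faiss/index.pkl pair like this one is
# typically produced and reloaded with LangChain (assumed, not shown in this
# commit). Requires langchain, sentence-transformers, and faiss-cpu.
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")

# Stand-in for the output of load_questions() in embed_qa.py:
questions = [
    {"query": "доставка сроки как быстро приедет курьер",
     "answer": "Курьер приезжает в течение дня", "articleId": 1},
]

db = FAISS.from_texts(
    [q["query"] for q in questions],  # the normalized query strings get embedded
    embeddings,
    metadatas=questions,              # answer/articleId travel along as metadata
)
db.save_local("faiss_qa_2023-08-20")  # writes index.faiss and index.pkl

# Reloading, e.g. inside the Space:
db = FAISS.load_local("faiss_qa_2023-08-20", embeddings)
print(db.similarity_search("когда приедет курьер", k=3))
```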
test.ipynb
CHANGED
@@ -67,18 +67,10 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2023-08-07 17:36:37.358149: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "from extract_keywords import canonical_keywords, merge_keywords, tokenize_sentence, extract_keywords, init_keyword_extractor\n",
-    "init_keyword_extractor()"
+    "from extract_keywords import normalize_word, canonical_keywords, merge_keywords, tokenize_sentence, extract_keywords, init_keyword_extractor\n",
+    "#init_keyword_extractor()"
    ]
   },
   {
@@ -86,28 +78,16 @@
    "execution_count": 5,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/makcrx/anaconda3/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:528: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
-      " warnings.warn(\n",
-      "/home/makcrx/anaconda3/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:528: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
-      " warnings.warn(\n"
-     ]
-    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "['
-      "[]\n"
+      "['почта', 'россия']\n"
      ]
     }
    ],
    "source": [
-    "print(
-    "print(extract_keywords('яндекс.доставка'))"
+    "print(tokenize_sentence('почты росии'))"
    ]
   },
   {
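The extract_keywords module imported at the top of the notebook is not part of this commit, so only its behavior is visible: tokenize_sentence('почты росии') comes back as ['почта', 'россия'], i.e. lemmatized tokens, and the stderr deleted above shows the tokenizer was being passed to a scikit-learn vectorizer. A rough sketch of that shape, with every implementation detail assumed; plain pymorphy2 lemmatization will not by itself reproduce the 'росии' → 'россия' spelling fix:

```python
# Assumed reconstruction for illustration only; the real extract_keywords
# module is not included in this commit.
import re

import pymorphy2
from sklearn.feature_extraction.text import TfidfVectorizer

morph = pymorphy2.MorphAnalyzer()

def tokenize_sentence(sentence):
    # Lowercase, split on non-word characters, reduce each token to its
    # dictionary (normal) form.
    words = re.findall(r"\w+", sentence.lower())
    return [morph.parse(w)[0].normal_form for w in words]

# Passing a custom tokenizer like this is what produced the (now deleted)
# "token_pattern will not be used since tokenizer is not None" warnings.
vectorizer = TfidfVectorizer(tokenizer=tokenize_sentence)

print(tokenize_sentence("почты росии"))  # the notebook output shows ['почта', 'россия']
```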