makcrx committed on 2023-08-20

Commit: 91856d0
Parent: fdc26d9
test.py → embed_qa.py RENAMED
@@ -2,21 +2,13 @@ import sqlite3, json
 from contextlib import closing
 
 # change THIS
-output_dir = 'faiss_qa_2023-08-09'
+output_dir = 'faiss_qa_2023-08-20'
 model_name = "multi-qa-MiniLM-L6-cos-v1"
 
 punctuation = '!"#\'(),:;?[]^`}{'
 punctuation2 = '-/&._~+*=@<>[]\\'
 remove_punctuation = str.maketrans(punctuation2, ' ' * len(punctuation2), punctuation)
 
-def add_special_questions(questions):
-    questions.append({
-        "question": "Позови человека/менеджера",
-        "query": "Позови человека/менеджера",
-        "answer": "Переключаю на сотрудника, ожидайте",
-        "articleId": 0,
-    })
-
 def load_questions(sqlite_filename):
     all_questions = []
     with closing(sqlite3.connect(sqlite_filename)) as db:
@@ -28,15 +20,15 @@ def load_questions(sqlite_filename):
         ).fetchall()
 
         for res in results:
-
+            if res['section'].lower() == 'служебные ответы':
+                res['section'] = ''
+
             questions = json.loads(res['questions'])
             for q in questions:
                 q['query'] = " ".join(res['section'].split() + res['title'].split() + q['question'].split()).translate(remove_punctuation).lower()
                 q['articleId'] = res['articleId']
            all_questions += questions
 
-    add_special_questions(all_questions)
-
    return all_questions
 
 print("Loading questions from db...")
faiss_qa_2023-08-20/index.faiss ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a8a5e3d0342187d57b9a80c431b59019c99e2ce85fdc32666a0578b958efd3b
+size 583725
faiss_qa_2023-08-20/index.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1da6f468c5ba954ba92921fa207550ba693c009ae1b6ec132d3a911e52d4f5f
+size 267292
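Both added files are Git LFS pointers, not the binary indexes themselves: the oid and size fields identify the real blobs (about 584 KB for index.faiss, 267 KB for index.pkl) held in LFS storage. Assuming the LangChain layout sketched above, reading the committed index back would look roughly like this (a sketch under that assumption; the query string is only an example):

    # Hypothetical loader for the committed index -- assumes the LangChain
    # FAISS layout (index.faiss + index.pkl) sketched above.
    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.vectorstores import FAISS

    embeddings = HuggingFaceEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")
    index = FAISS.load_local('faiss_qa_2023-08-20', embeddings)

    # Example lookup ("how do I track a parcel"); queries should get the same
    # punctuation stripping and lowercasing embed_qa.py applies at build time.
    hits = index.similarity_search_with_score('как отследить посылку', k=3)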
test.ipynb CHANGED
@@ -67,18 +67,10 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2023-08-07 17:36:37.358149: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "from extract_keywords import canonical_keywords, merge_keywords, tokenize_sentence, extract_keywords, init_keyword_extractor\n",
-    "init_keyword_extractor()"
+    "from extract_keywords import normalize_word, canonical_keywords, merge_keywords, tokenize_sentence, extract_keywords, init_keyword_extractor\n",
+    "#init_keyword_extractor()"
    ]
   },
   {
@@ -86,28 +78,16 @@
    "execution_count": 5,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/makcrx/anaconda3/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:528: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
-      " warnings.warn(\n",
-      "/home/makcrx/anaconda3/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:528: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
-      " warnings.warn(\n"
-     ]
-    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "['яндекс доставка экспресс']\n",
-      "[]\n"
+      "['почта', 'россия']\n"
      ]
     }
    ],
    "source": [
-    "print(extract_keywords('яд экспресс'))\n",
-    "print(extract_keywords('яндекс.доставка'))"
+    "print(tokenize_sentence('почты росии'))"
    ]
   },
   {
  {