makcrx committed
Commit: 9647155
1 Parent(s): 3334b06
app.py CHANGED
@@ -5,7 +5,7 @@ import reranking
 from extract_keywords import init_keyword_extractor, extract_keywords
 
 embeddings = SentenceTransformerEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")
-db = FAISS.load_local('faiss_qa', embeddings)
+db = FAISS.load_local('faiss_qa_2023-08-09', embeddings)
 init_keyword_extractor()
 
 def main(query):
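
The only functional change in app.py is pointing the app at the dated index directory. A minimal sketch of this load-and-query path, assuming the same LangChain API the repo already uses (the sample query and `k` value are illustrative, not from the repo):

```python
from langchain.vectorstores import FAISS
from langchain.embeddings import SentenceTransformerEmbeddings

# The index must be loaded with the same model it was built with (see test.py).
embeddings = SentenceTransformerEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")
db = FAISS.load_local('faiss_qa_2023-08-09', embeddings)

# Each hit carries the answer and articleId that test.py stored as metadata.
for doc, score in db.similarity_search_with_score('яндекс доставка экспресс', k=3):
    print(score, doc.metadata['articleId'], doc.page_content)
```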
faiss_qa_2023-08-09/index.faiss ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7690fb7aa21b8d325e3ce1a9f8fb241dc597aa06df042bf242c522433243b93f
+size 576045
faiss_qa_2023-08-09/index.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09f3705caef79861035ed026855cefb872c386e87938d73a5deb57479507364d
+size 265781
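
Both index files are committed as Git LFS pointers: only the oid (a sha256 of the raw bytes) and the size live in the repository. A quick way to check that a fetched file matches its pointer (a sketch; the path assumes the repo root):

```python
import hashlib

def lfs_sha256(path):
    """Hash a file the way Git LFS records it: sha256 over the raw bytes."""
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            h.update(chunk)
    return h.hexdigest()

# Should print the oid from the pointer file above.
print(lfs_sha256('faiss_qa_2023-08-09/index.faiss'))
```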
test.ipynb CHANGED
@@ -9,6 +9,7 @@
    "import sqlite3, json\n",
    "from contextlib import closing\n",
    "\n",
+   "# use test.py to update questions in db!!!\n",
    "def load_questions(sqlite_filename):\n",
    "    all_questions = []\n",
    "    with closing(sqlite3.connect(sqlite_filename)) as db:\n",
@@ -82,30 +83,31 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "/home/makcrx/anaconda3/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:528: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
+      "  warnings.warn(\n",
       "/home/makcrx/anaconda3/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:528: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
       "  warnings.warn(\n"
      ]
     },
     {
-     "data": {
-      "text/plain": [
-       "['почта россия трекинг']"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['яндекс доставка экспресс']\n",
+      "[]\n"
+     ]
     }
    ],
    "source": [
-    "extract_keywords('пр трекинг')"
+    "print(extract_keywords('яд экспресс'))\n",
+    "print(extract_keywords('яндекс.доставка'))"
    ]
   },
   {
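
The updated cell shows the extractor expanding the abbreviation 'яд' into keywords while returning nothing for the dotted form 'яндекс.доставка'. The same check as a plain script, a sketch assuming the repo's own extract_keywords module (init_keyword_extractor must run first, as app.py does):

```python
from extract_keywords import init_keyword_extractor, extract_keywords

init_keyword_extractor()

# Mirrors the notebook cell: the abbreviation expands, the dotted form does not.
print(extract_keywords('яд экспресс'))      # expected: ['яндекс доставка экспресс']
print(extract_keywords('яндекс.доставка'))  # expected: []
```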
test.py ADDED
@@ -0,0 +1,64 @@
+import sqlite3, json
+from contextlib import closing
+
+# change THIS when rebuilding the index
+output_dir = 'faiss_qa_2023-08-09'
+model_name = "multi-qa-MiniLM-L6-cos-v1"
+
+# Characters in punctuation are deleted; characters in punctuation2 become spaces.
+punctuation = '!"#\'(),:;?[]^`}{'
+punctuation2 = '-/&._~+*=@<>[]\\'
+remove_punctuation = str.maketrans(punctuation2, ' ' * len(punctuation2), punctuation)
+
+def add_special_questions(questions):
+    # Fallback entry: "Call a human/manager" -> "Transferring you to an agent, please wait"
+    questions.append({
+        "question": "Позови человека/менеджера",
+        "query": "Позови человека/менеджера",
+        "answer": "Переключаю на сотрудника, ожидайте",
+        "articleId": 0,
+    })
+
+def load_questions(sqlite_filename):
+    all_questions = []
+    with closing(sqlite3.connect(sqlite_filename)) as db:
+        db.row_factory = sqlite3.Row
+        with closing(db.cursor()) as cursor:
+            results = cursor.execute(
+                "SELECT id, articleId, title, category, section, questions FROM articles WHERE articleType = ? AND (doNotUse IS NULL OR doNotUse = 0)",
+                ('article',)
+            ).fetchall()
+
+            for res in results:
+                questions = json.loads(res['questions'])
+                for q in questions:
+                    # Query text = section + title + question, punctuation-normalized, lowercased.
+                    q['query'] = " ".join(res['section'].split() + res['title'].split() + q['question'].split()).translate(remove_punctuation).lower()
+                    q['articleId'] = res['articleId']
+                all_questions += questions
+
+    add_special_questions(all_questions)
+
+    return all_questions
+
+print("Loading questions from db...")
+questions = load_questions("omnidesk-ai-chatgpt-questions.sqlite")
+
+# print(questions[0])
+
+from langchain.vectorstores import FAISS
+from langchain.docstore.document import Document
+from langchain.embeddings import SentenceTransformerEmbeddings
+
+docs = [
+    Document(page_content=q['query'], metadata={ 'answer': q['answer'], 'articleId': q['articleId'] })
+    for q in questions
+]
+
+print(f"Loading embeddings model {model_name}...")
+embeddings = SentenceTransformerEmbeddings(model_name=model_name)
+
+print("Embedding documents...")
+
+db = FAISS.from_documents(docs, embeddings)
+db.save_local(output_dir)
+
+print('Saved!')
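
The directory written by save_local here is exactly what app.py now loads. A minimal sketch one could append to the end of test.py to sanity-check the build (the sample queries are illustrative; they come from the notebook test above):

```python
# The same normalization applied at build time: punctuation2 chars become
# spaces, punctuation chars are deleted, so the dotted form splits in two.
print("яндекс.доставка".translate(remove_punctuation).lower())  # -> 'яндекс доставка'

# Smoke-test the freshly built index before the app picks it up.
hits = db.similarity_search('яндекс доставка экспресс', k=1)
print(hits[0].metadata['articleId'], hits[0].metadata['answer'])
```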