# omnidesk-ai-test / embed_qa.py
import sqlite3, json
from contextlib import closing
# --- Configuration: update these before each run ---
output_dir = 'faiss_qa_2023-08-20'
model_name = "multi-qa-MiniLM-L6-cos-v1"

# Translation table for query normalization: characters in `punctuation`
# are deleted, characters in `punctuation2` are replaced with a space
# (characters listed in both sets are deleted).
punctuation = '!"#\'(),:;?[]^`}{'
punctuation2 = '-/&._~+*=@<>[]\\'
remove_punctuation = str.maketrans(punctuation2, ' ' * len(punctuation2), punctuation)
def load_questions(sqlite_filename):
    all_questions = []
    with closing(sqlite3.connect(sqlite_filename)) as db:
        db.row_factory = sqlite3.Row
        with closing(db.cursor()) as cursor:
            # Only embeddable articles: articleType = 'article' and not flagged doNotUse.
            results = cursor.execute(
                "SELECT id, articleId, title, category, section, questions FROM articles"
                " WHERE articleType = ? AND (doNotUse IS NULL OR doNotUse = 0)",
                ('article',)
            ).fetchall()
        for res in results:
            section = res['section'].lower()
            title = res['title'].lower()
            # 'служебная информация' ("service information") sections are
            # boilerplate; keep them out of the embedded query text.
            if section == 'служебная информация':
                section = ''
                title = ''
            questions = json.loads(res['questions'])
            for q in questions:
                # Text to embed: section + title + question, lowercased,
                # with punctuation removed or replaced by spaces.
                q['query'] = " ".join(
                    section.split() + title.split() + q['question'].split()
                ).translate(remove_punctuation).lower()
                q['articleId'] = res['articleId']
            all_questions += questions
    return all_questions
print("Loading questions from db...")
questions = load_questions("omnidesk-ai-chatgpt-questions.sqlite")
# print(questions[0])
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain.embeddings import SentenceTransformerEmbeddings
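# One Document per question: the normalized query is what gets embedded;
# the answer and source articleId ride along as metadata for retrieval.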
docs = [
    Document(page_content=q['query'], metadata={'answer': q['answer'], 'articleId': q['articleId']})
    for q in questions
]
print(f"Loading embeddings model {model_name}...")
embeddings = SentenceTransformerEmbeddings(model_name=model_name)
print("embedding documents...")
db = FAISS.from_documents(docs, embeddings)
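# Note: langchain's FAISS store ranks by L2 distance by default. The
# *-cos-v1 sentence-transformers models are meant for cosine similarity
# and emit normalized vectors, for which L2 ranking is equivalent.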
db.save_local(output_dir)
print('Saved!')
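
# Optional sanity check: reload the saved index and run one query against it.
# The query string below is only an illustrative placeholder.
reloaded = FAISS.load_local(output_dir, embeddings)
for doc, score in reloaded.similarity_search_with_score("пример вопроса", k=3):
    print(f"{score:.3f}", doc.metadata['articleId'], doc.page_content)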