import sqlite3, json
from contextlib import closing

# change THIS
output_dir = 'faiss_qa_2023-08-20'

model_name = "multi-qa-MiniLM-L6-cos-v1"

# Characters in `punctuation` are deleted outright; characters in
# `punctuation2` are replaced with a space (the third maketrans argument,
# the deletion set, wins for characters that appear in both).
punctuation = '!"#\'(),:;?[]^`}{'
punctuation2 = '-/&._~+*=@<>[]\\'
remove_punctuation = str.maketrans(punctuation2, ' ' * len(punctuation2), punctuation)


def load_questions(sqlite_filename):
    all_questions = []
    with closing(sqlite3.connect(sqlite_filename)) as db:
        db.row_factory = sqlite3.Row
        with closing(db.cursor()) as cursor:
            # Parentheses around the OR are required: without them, SQL's
            # precedence (AND binds tighter) would also match rows of other
            # article types that have doNotUse = 0.
            results = cursor.execute(
                "SELECT id, articleId, title, category, section, questions"
                " FROM articles"
                " WHERE articleType = ? AND (doNotUse IS NULL OR doNotUse = 0)",
                ('article',)
            ).fetchall()
            for res in results:
                section = res['section'].lower()
                title = res['title'].lower()
                if section == 'служебная информация':  # "service information"
                    section = ''
                    title = ''
                questions = json.loads(res['questions'])
                for q in questions:
                    # Build the searchable text: section + title + question,
                    # whitespace-normalized, punctuation stripped, lowercased.
                    q['query'] = " ".join(
                        section.split() + title.split() + q['question'].split()
                    ).translate(remove_punctuation).lower()
                    q['articleId'] = res['articleId']
                all_questions += questions
    return all_questions


print("Loading questions from db...")
questions = load_questions("omnidesk-ai-chatgpt-questions.sqlite")
# print(questions[0])

from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain.embeddings import SentenceTransformerEmbeddings

# One Document per question; the answer and article id travel along as metadata.
docs = [
    Document(page_content=q['query'], metadata={
        'answer': q['answer'],
        'articleId': q['articleId'],
    })
    for q in questions
]

print(f"Loading embeddings model {model_name}...")
embeddings = SentenceTransformerEmbeddings(model_name=model_name)

print("Embedding documents...")
db = FAISS.from_documents(docs, embeddings)
db.save_local(output_dir)
print('Saved!')
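
# Optional sanity check (a minimal sketch, not part of the original script):
# reload the index we just saved and run a test query. The query string below
# is a hypothetical example; at query time the text should be normalized the
# same way as the indexed documents (remove_punctuation + lower()).
db2 = FAISS.load_local(output_dir, embeddings)
test_query = "как сменить тариф".translate(remove_punctuation).lower()
for doc, score in db2.similarity_search_with_score(test_query, k=3):
    print(f"{score:.3f} articleId={doc.metadata['articleId']}")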