Spaces:

retailcrmservices
/

omnidesk-ai-test

Runtime error

omnidesk-ai-test / embed_qa.py

makcrx

fix faiss index

d6a31a5 over 1 year ago

1.89 kB

	import sqlite3, json
	from contextlib import closing

	# Run settings -- adjust these before building a new index.
	output_dir = 'faiss_qa_2023-08-20'        # directory the FAISS index is saved into
	model_name = "multi-qa-MiniLM-L6-cos-v1"  # name handed to the embeddings wrapper

	# Characters that are deleted outright from a query.
	punctuation = '!"#\'(),:;?[]^`}{'
	# Characters that are turned into a single space instead of being deleted.
	punctuation2 = '-/&._~+*=@<>[]\\'

	# Table for str.translate(): first map every "spacing" character to a space,
	# then mark every "deleted" character with None.  '[' and ']' occur in both
	# strings; the deletion entry is applied last and therefore wins.
	remove_punctuation = dict.fromkeys(map(ord, punctuation2), ord(' '))
	remove_punctuation.update(dict.fromkeys(map(ord, punctuation)))

	def load_questions(sqlite_filename):
	    """Load every usable question from the articles database.

	    Selects rows of the ``articles`` table whose ``articleType`` is
	    ``'article'`` and whose ``doNotUse`` flag is NULL or 0, parses each row's
	    ``questions`` column as a JSON array of dicts, and annotates every
	    question dict in place with:

	    * ``query`` -- the section, title and question text joined with single
	      spaces, run through ``remove_punctuation`` and lower-cased;
	    * ``articleId`` -- the ``articleId`` of the row the question came from.

	    Args:
	        sqlite_filename: Path of the SQLite database file to read.

	    Returns:
	        A flat list with the question dicts of all selected rows.
	    """
	    # The doNotUse alternatives must be parenthesised: AND binds tighter than
	    # OR in SQL, so the unparenthesised form selected every row with
	    # doNotUse = 0 regardless of its articleType.
	    query = (
	        "SELECT id, articleId, title, category, section, questions "
	        "FROM articles "
	        "WHERE articleType = ? AND (doNotUse IS NULL OR doNotUse = 0)"
	    )
	    with closing(sqlite3.connect(sqlite_filename)) as db:
	        db.row_factory = sqlite3.Row  # allows access to columns by name
	        with closing(db.cursor()) as cursor:
	            results = cursor.execute(query, ('article',)).fetchall()

	    all_questions = []
	    for res in results:
	        # NULL columns arrive as None; treat them as empty text.
	        section = (res['section'] or '').lower()
	        title = (res['title'] or '').lower()
	        if section == 'служебная информация':
	            # For this section neither section nor title is put into the query.
	            section = ''
	            title = ''
	        # Same for every question of this article, so build it only once.
	        prefix_words = section.split() + title.split()

	        questions = json.loads(res['questions'] or '[]')
	        for q in questions:
	            words = prefix_words + q['question'].split()
	            q['query'] = " ".join(words).translate(remove_punctuation).lower()
	            q['articleId'] = res['articleId']
	        all_questions += questions

	    return all_questions

	# Read all question dicts from the SQLite file; this list is what gets turned
	# into documents and embedded below.
	print("Loading questions from db...")
	questions = load_questions("omnidesk-ai-chatgpt-questions.sqlite")

	# Debug aid: uncomment to inspect the first loaded question.
	# print(questions[0])

	from langchain.vectorstores import FAISS
	from langchain.docstore.document import Document
	from langchain.embeddings import SentenceTransformerEmbeddings

	def _as_document(entry):
	    """Wrap one question dict in a Document: query as text, answer/articleId as metadata."""
	    return Document(
	        page_content=entry['query'],
	        metadata={'answer': entry['answer'], 'articleId': entry['articleId']},
	    )

	# One Document per loaded question, in the same order as `questions`.
	docs = list(map(_as_document, questions))

	print(f"Loading embeddings model {model_name}...")
	embeddings = SentenceTransformerEmbeddings(model_name=model_name)

	# Embed all documents into a FAISS store and write it to `output_dir`.
	print("embedding documents...")
	db = FAISS.from_documents(docs, embeddings)
	db.save_local(output_dir)
	print('Saved!')