#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Build a FAISS document store with DPR embeddings for the wiki text corpus.

The script loads the text files under ``doc_dir``, writes them to a
SQLite-backed FAISS document store, computes dense passage embeddings for
them and saves the FAISS index to disk.

Created on Tue Dec 27 14:50:23 2022

@author: saeed
"""

import os

from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever
from haystack.utils import convert_files_to_docs, clean_wiki_text

# Run relative to this file so the data directory and the output files
# resolve the same way regardless of the caller's working directory.
module_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(module_dir)

doc_dir = "data/wiki_gameofthrones_txt12"
sql_file = 'faiss_doc_store.db'
faiss_file = 'faiss_index.faiss'

# %% Download/Load Docs
# Get some files that we want to use
# s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt12.zip"
# fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

print('---> Loading Documents ...')
# Convert files to docs + cleaning
docs = convert_files_to_docs(
    dir_path=doc_dir,
    clean_func=clean_wiki_text,
    split_paragraphs=True,
)

# %% Document Store
print('---> Creating document store ...')
# NOTE: embedding_dim must match the output dimension of the passage encoder
# configured below.
document_store = FAISSDocumentStore(
    embedding_dim=128,
    faiss_index_factory_str="Flat",
    sql_url=f"sqlite:///{sql_file}",
)

# %% Retriever (DPR)
print('---> Initializing retriever ...')
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="vblagoje/dpr-question_encoder-single-lfqa-wiki",
    passage_embedding_model="vblagoje/dpr-ctx_encoder-single-lfqa-wiki",
    use_gpu=True,
)

# %% Create Embeddings and save results
# The documents must be in the store BEFORE embeddings are computed:
# update_embeddings() only embeds documents that already exist in the store,
# so calling it on an empty store would leave the FAISS index without vectors.
document_store.write_documents(docs)
document_store.update_embeddings(retriever)

print('---> Saving results ...')
# save faiss file
document_store.save(faiss_file)

print('Done!')