## Imports

In [None]:
import pandas as pd

from haystack.nodes import PreProcessor, EmbeddingRetriever
from haystack.document_stores import FAISSDocumentStore
from haystack.utils import convert_files_to_docs

## Preprocess Documents

### BLAB-Wiki

In [None]:
# Cleaning options: drop empty lines and stray whitespace, but leave
# headers/footers untouched.
cleaning_settings = {
    "clean_empty_lines": True,
    "clean_whitespace": True,
    "clean_header_footer": False,
}

# Splitting options: sentence-based units of length 2 with an overlap of 1.
splitting_settings = {
    "split_by": "sentence",
    "split_length": 2,
    "split_overlap": 1,
    "split_respect_sentence_boundary": False,
}

preprocessor = PreProcessor(**cleaning_settings, **splitting_settings)

# Load every file under the wiki folder and run it through the preprocessor.
wiki_pages_dir = "./Fontes/Wiki_Pages/"
all_docs = convert_files_to_docs(dir_path=wiki_pages_dir)
docs_default = preprocessor.process(all_docs)

### QA Source

In [None]:
# QA sentences: collect the 'new_long_answers' column from every split of the
# QA base so that all answers are written to the document store.
QA_path = "./Fontes/QA_Base/"

train = pd.read_parquet(QA_path + 'train.parquet')['new_long_answers']
test = pd.read_parquet(QA_path + 'test.parquet')['new_long_answers']
validation = pd.read_parquet(QA_path + 'validation.parquet')['new_long_answers']

# The splits are simply stacked; the resulting index is not used below.
answers = pd.concat([train, test, validation])

# Wrap each answer in a document dict. Only the values are needed, so iterate
# the Series directly instead of unpacking (index, value) pairs from .items().
docs_list = [
    {"content": answer, "content_type": "text", "score": None, "meta": None}
    for answer in answers
]

## Create the DocumentStore and compute embeddings

In [None]:
# FAISS-backed store using dot-product similarity over 512-dimensional vectors.
# NOTE(review): embedding_dim presumably has to match the output size of the
# embedding model used to fill this store -- confirm when changing the model.
document_store = FAISSDocumentStore(
    similarity="dot_product",
    embedding_dim=512,
)

# Index the preprocessed wiki passages together with the QA answers.
documents_to_index = docs_default + docs_list
document_store.write_documents(documents_to_index)

In [None]:
# Sentence-transformers model used to embed the stored documents.
embedding_model_name = "sentence-transformers/distiluse-base-multilingual-cased-v1"

retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model=embedding_model_name,
)

# Compute embeddings for the documents in the store, 10,000 per batch.
document_store.update_embeddings(retriever=retriever, batch_size=10_000)