{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build the FAISS document store\n",
    "\n",
    "Splits the wiki pages into overlapping two-sentence passages, adds the long answers from the QA parquet splits, writes everything to a `FAISSDocumentStore`, and computes an embedding for every stored document."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "from haystack.document_stores import FAISSDocumentStore\n",
    "from haystack.nodes import EmbeddingRetriever, PreProcessor\n",
    "from haystack.utils import convert_files_to_docs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Source locations, relative to the notebook's working directory.\n",
    "WIKI_DIR = Path(\"./Fontes/Wiki_Pages/\")\n",
    "QA_DIR = Path(\"./Fontes/QA_Base/\")\n",
    "QA_SPLITS = (\"train\", \"test\", \"validation\")\n",
    "QA_ANSWER_COLUMN = \"new_long_answers\"\n",
    "\n",
    "# When changing the model, keep EMBEDDING_DIM in sync with the size of the\n",
    "# vectors it produces (the value is handed to the document store below).\n",
    "EMBEDDING_MODEL = \"sentence-transformers/distiluse-base-multilingual-cased-v1\"\n",
    "EMBEDDING_DIM = 512\n",
    "EMBEDDING_BATCH_SIZE = 10_000"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preprocess Documents"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### BLAB-Wiki"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each passage holds 2 sentences and shares 1 sentence with its neighbour.\n",
    "preprocessor = PreProcessor(\n",
    "    clean_empty_lines=True,\n",
    "    clean_whitespace=True,\n",
    "    clean_header_footer=False,\n",
    "    split_by=\"sentence\",\n",
    "    split_length=2,\n",
    "    split_overlap=1,\n",
    "    split_respect_sentence_boundary=False,\n",
    ")\n",
    "\n",
    "all_docs = convert_files_to_docs(dir_path=str(WIKI_DIR))\n",
    "docs_default = preprocessor.process(all_docs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### QA Source"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Long answers from every QA split, stacked into a single Series.\n",
    "answers_raw = pd.concat(\n",
    "    [pd.read_parquet(QA_DIR / f\"{split}.parquet\")[QA_ANSWER_COLUMN] for split in QA_SPLITS]\n",
    ")\n",
    "\n",
    "# A missing answer (None/NaN) is not usable document text, so drop those rows\n",
    "# up front and report how many were removed instead of failing later.\n",
    "answers = answers_raw.dropna()\n",
    "n_dropped = len(answers_raw) - len(answers)\n",
    "if n_dropped:\n",
    "    print(f\"Dropped {n_dropped} QA rows with a missing answer\")\n",
    "\n",
    "# One plain-text document dict per answer.\n",
    "docs_list = [\n",
    "    {\"content\": text, \"content_type\": \"text\", \"score\": None, \"meta\": None}\n",
    "    for text in answers\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create the document store and compute embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wiki passages and QA answers go into the same store.\n",
    "document_store = FAISSDocumentStore(similarity=\"dot_product\", embedding_dim=EMBEDDING_DIM)\n",
    "document_store.write_documents(docs_default + docs_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "retriever = EmbeddingRetriever(\n",
    "    document_store=document_store,\n",
    "    embedding_model=EMBEDDING_MODEL,\n",
    ")\n",
    "\n",
    "# NOTE(review): nothing in this notebook persists the index afterwards (there\n",
    "# is no `document_store.save(...)` call) - confirm whether that is intended.\n",
    "document_store.update_embeddings(retriever, batch_size=EMBEDDING_BATCH_SIZE)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}