Spaces:
Sleeping
Sleeping
File size: 2,955 Bytes
2d67dd4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"from haystack.nodes import PreProcessor, EmbeddingRetriever\n",
"from haystack.document_stores import FAISSDocumentStore\n",
"from haystack.utils import convert_files_to_docs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preprocess Documents"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### BLAB-Wiki"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Read every file under the wiki folder into documents\n",
"all_docs = convert_files_to_docs(dir_path=\"./Fontes/Wiki_Pages/\")\n",
"\n",
"# Clean the text, then split it by sentence: 2 sentences per passage, overlap of 1\n",
"# NOTE(review): no language is passed to the splitter - confirm its default suits these pages\n",
"preprocessor = PreProcessor(\n",
"    split_by=\"sentence\",\n",
"    split_length=2,\n",
"    split_overlap=1,\n",
"    split_respect_sentence_boundary=False,\n",
"    clean_empty_lines=True,\n",
"    clean_whitespace=True,\n",
"    clean_header_footer=False,\n",
")\n",
"docs_default = preprocessor.process(all_docs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### QA Source"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Long answers from every split of the QA base\n",
"QA_path = \"./Fontes/QA_Base/\"\n",
"\n",
"train, test, validation = (\n",
"    pd.read_parquet(QA_path + file_name)['new_long_answers']\n",
"    for file_name in ('train.parquet', 'test.parquet', 'validation.parquet')\n",
")\n",
"\n",
"answers = pd.concat([train, test, validation])\n",
"\n",
"# One document dict per answer; the Series index is not needed\n",
"docs_list = [\n",
"    {\"content\": text, \"content_type\": \"text\", \"score\": None, \"meta\": None}\n",
"    for text in answers\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create DocumentStore and calculate Embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# FAISS-backed store configured for 512-dimensional embeddings and dot-product similarity\n",
"document_store = FAISSDocumentStore(embedding_dim=512, similarity=\"dot_product\")\n",
"\n",
"# Write the wiki passages and the QA answers into the same store\n",
"document_store.write_documents([*docs_default, *docs_list])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Retriever bound to the document store and the multilingual sentence-transformers model\n",
"retriever = EmbeddingRetriever(\n",
"    embedding_model=\"sentence-transformers/distiluse-base-multilingual-cased-v1\",\n",
"    document_store=document_store,\n",
")\n",
"\n",
"# Update the embeddings of the stored documents using this retriever\n",
"document_store.update_embeddings(retriever, batch_size=10000)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|