{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Indexing\n", "Using [Haystack](https://github.com/deepset-ai/haystack), the following steps are performed:\n", "- load and preprocess documents downloaded from Wikipedia\n", "- create document store and write documents\n", "- initialize retriever and generate document embeddings" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", "trusted": true }, "outputs": [], "source": [ "# %pip (instead of a `! pip` shell call) installs into the environment of the running kernel.\n", "# The requirement is quoted so the shell cannot glob-expand the [extras] brackets.\n", "%pip install \"farm-haystack[faiss-gpu]==1.7.0\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load documents" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2022-08-21T08:23:23.692554Z", "iopub.status.busy": "2022-08-21T08:23:23.692208Z", "iopub.status.idle": "2022-08-21T08:23:23.700721Z", "shell.execute_reply": "2022-08-21T08:23:23.698130Z", "shell.execute_reply.started": "2022-08-21T08:23:23.692512Z" }, "trusted": true }, "outputs": [], "source": [ "import glob\n", "import json" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2022-08-21T08:23:23.707774Z", "iopub.status.busy": "2022-08-21T08:23:23.704107Z", "iopub.status.idle": "2022-08-21T08:23:25.026910Z", "shell.execute_reply": "2022-08-21T08:23:25.025990Z", "shell.execute_reply.started": "2022-08-21T08:23:23.705010Z" }, "trusted": true }, "outputs": [], "source": [ "# Parse every JSON file matched by the glob pattern and collect the parsed objects in `docs`.\n", "docs = []\n", "\n", "# glob returns paths in arbitrary (filesystem-dependent) order; sorting keeps `docs`\n", "# in the same order on every run.\n", "for json_file in sorted(glob.glob(\"../input/crawl-rock/rock_wiki/*.json\")):\n", "    # Explicit encoding so parsing does not depend on the platform's default locale\n", "    # (assumes the crawled files are UTF-8, the JSON interchange default -- TODO confirm).\n", "    with open(json_file, \"r\", encoding=\"utf-8\") as fin:\n", "        doc = json.load(fin)\n", "\n", "    docs.append(doc)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2022-08-21T08:23:25.030530Z", "iopub.status.busy": "2022-08-21T08:23:25.029931Z", "iopub.status.idle": "2022-08-21T08:23:25.039324Z", "shell.execute_reply":
"2022-08-21T08:23:25.037960Z", "shell.execute_reply.started": "2022-08-21T08:23:25.030491Z" }, "trusted": true }, "outputs": [ { "data": { "text/plain": [ "453" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(docs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preprocess documents" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2022-08-21T08:23:25.050479Z", "iopub.status.busy": "2022-08-21T08:23:25.050099Z", "iopub.status.idle": "2022-08-21T08:23:42.089083Z", "shell.execute_reply": "2022-08-21T08:23:42.087929Z", "shell.execute_reply.started": "2022-08-21T08:23:25.050446Z" }, "trusted": true }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "108e8c46426f44e7be98a8ae930d81ce", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Preprocessing: 0%| | 0/453 [00:00,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preprocessed_docs[:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# select only documents with at least 10 words. 
Otherwise, the documents are not very informative\n", "MIN_WORDS = 10  # minimum number of whitespace-separated tokens a document must contain\n", "preprocessed_docs = [\n", "    doc\n", "    for doc in preprocessed_docs\n", "    if len(doc.content.split()) >= MIN_WORDS\n", "]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create document store ([FAISS](https://github.com/facebookresearch/faiss)) and write documents" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2022-08-21T08:23:42.119585Z", "iopub.status.busy": "2022-08-21T08:23:42.118544Z", "iopub.status.idle": "2022-08-21T08:23:42.124669Z", "shell.execute_reply": "2022-08-21T08:23:42.123597Z", "shell.execute_reply.started": "2022-08-21T08:23:42.119551Z" }, "trusted": true }, "outputs": [], "source": [ "from haystack.document_stores import FAISSDocumentStore\n", "from haystack.nodes import EmbeddingRetriever" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2022-08-21T08:23:42.129562Z", "iopub.status.busy": "2022-08-21T08:23:42.128772Z", "iopub.status.idle": "2022-08-21T08:23:42.259879Z", "shell.execute_reply": "2022-08-21T08:23:42.258950Z", "shell.execute_reply.started": "2022-08-21T08:23:42.129518Z" }, "trusted": true }, "outputs": [], "source": [ "# Store settings are chosen to be compatible with the Embedding Retriever:\n", "# dot-product similarity over 768-dimensional embeddings.\n", "document_store = FAISSDocumentStore(\n", "    embedding_dim=768,\n", "    similarity=\"dot_product\",\n", ")" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "execution": { "iopub.execute_input": "2022-08-21T08:43:25.952230Z", "iopub.status.busy": "2022-08-21T08:43:25.951856Z", "iopub.status.idle": "2022-08-21T08:46:12.506842Z", "shell.execute_reply": "2022-08-21T08:46:12.505845Z", "shell.execute_reply.started": "2022-08-21T08:43:25.952198Z" }, "trusted": true }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "dbd72ecf0d36401ba26826f7d9a42540", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Writing Documents: 0%| | 0/50024 [00:00