diff --git "a/notebooks/indexing_and_pipeline_creation.ipynb" "b/notebooks/indexing_and_pipeline_creation.ipynb" new file mode 100644--- /dev/null +++ "b/notebooks/indexing_and_pipeline_creation.ipynb" @@ -0,0 +1,10079 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3922a573", + "metadata": { + "id": "3922a573" + }, + "source": [ + "# Indexing and pipeline creation\n", + "This notebook is inspired by the [\"Build Your First QA System\" tutorial](https://haystack.deepset.ai/tutorials/first-qa-system) from the Haystack documentation.\n", + "\n", + "Here we use a collection of articles about Twin Peaks to answer a variety of questions about that awesome TV series!\n", + "\n", + "The following steps are performed:\n", + "* load and preprocess data\n", + "* create document store and write documents\n", + "* initialize retriever and generate document embeddings\n", + "* initialize reader\n", + "* compose and try Question Answering pipeline\n", + "* save and export index" + ] + }, + { + "cell_type": "markdown", + "id": "viixGIJcKPSQ", + "metadata": { + "id": "viixGIJcKPSQ" + }, + "source": [ + "## Preliminary operations" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "MevE4jEZ5QBT", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MevE4jEZ5QBT", + "outputId": "727734c9-8366-4686-cbe1-bf3d149db1c5" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mounted at /content/drive\n" + ] + } + ], + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "VYWRJ-Lf55nV", + "metadata": { + "id": "VYWRJ-Lf55nV" + }, + "outputs": [], + "source": [ + "# install dependencies\n", + "! 
pip install farm-haystack[faiss-gpu]==1.4.0" + ] + }, + { + "cell_type": "markdown", + "id": "QVDuHAMIK4bg", + "metadata": { + "id": "QVDuHAMIK4bg" + }, + "source": [ + "## Load and preprocess data" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "72139774", + "metadata": { + "execution": { + "iopub.execute_input": "2022-01-09T08:40:46.176031Z", + "iopub.status.busy": "2022-01-09T08:40:46.175755Z", + "iopub.status.idle": "2022-01-09T08:40:46.179554Z", + "shell.execute_reply": "2022-01-09T08:40:46.178704Z", + "shell.execute_reply.started": "2022-01-09T08:40:46.175959Z" + }, + "id": "72139774" + }, + "outputs": [], + "source": [ + "import glob, json" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4421e328", + "metadata": { + "execution": { + "iopub.execute_input": "2022-01-09T08:40:47.846999Z", + "iopub.status.busy": "2022-01-09T08:40:47.846757Z", + "iopub.status.idle": "2022-01-09T08:40:48.327632Z", + "shell.execute_reply": "2022-01-09T08:40:48.326829Z", + "shell.execute_reply.started": "2022-01-09T08:40:47.846975Z" + }, + "id": "4421e328" + }, + "outputs": [], + "source": [ + "DATA_DIRECTORY = '/content/drive/MyDrive/Colab Notebooks/wklp/data'\n", + "\n", + "docs=[]\n", + "\n", + "for json_file in glob.glob(f'{DATA_DIRECTORY}/*.json'):\n", + " with open(json_file, 'r') as fin:\n", + " json_content=json.load(fin)\n", + " \n", + " doc={'content': json_content['text'],\n", + " 'meta': {'name': json_content['name'],\n", + " 'url': json_content['url']}}\n", + " docs.append(doc)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "GR6qWQAn72WG", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GR6qWQAn72WG", + "outputId": "6295735e-93ca-4047-8c64-727f2cdf5458" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1087" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(docs)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 6, + "id": "aa231b94", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "execution": { + "iopub.execute_input": "2022-01-09T08:40:48.796741Z", + "iopub.status.busy": "2022-01-09T08:40:48.796550Z", + "iopub.status.idle": "2022-01-09T08:40:48.805224Z", + "shell.execute_reply": "2022-01-09T08:40:48.804705Z", + "shell.execute_reply.started": "2022-01-09T08:40:48.796722Z" + }, + "id": "aa231b94", + "outputId": "af3c814e-3325-4c1b-e0a3-5314e1cf217a" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'content': \"Pete Lindstrom\\nPete Lindstrom was a citizen of Twin Peaks, Washington who was killed in the Blizzard of 1889.\\nHis death was witnessed by Knut Zimmerman, who reported that wind had plunged a candle from the Annual Candlelighting and Christmas Tree Ceremony into the back of Lindstrom's head, killing him.\",\n", + " 'meta': {'name': 'Pete_Lindstrom',\n", + " 'url': 'https://twinpeaks.fandom.com/wiki/Pete_Lindstrom'}}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[5]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "bc5adb1c", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "execution": { + "iopub.execute_input": "2022-01-09T08:41:04.538529Z", + "iopub.status.busy": "2022-01-09T08:41:04.538227Z", + "iopub.status.idle": "2022-01-09T08:41:05.147190Z", + "shell.execute_reply": "2022-01-09T08:41:05.146513Z", + "shell.execute_reply.started": "2022-01-09T08:41:04.538503Z" + }, + "id": "bc5adb1c", + "outputId": "2fc943a9-b55c-4f03-f577-343cb675e115" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Unzipping tokenizers/punkt.zip.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/1087 [00:00\n" + ] + } + ], + "source": [ + "print(preprocessed_docs[5])\n" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b9PS0PkM_1EF", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "b9PS0PkM_1EF", + "outputId": "659eef80-dfde-42cf-f944-08bf0bba73a5" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "2825" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(preprocessed_docs)" + ] + }, + { + "cell_type": "markdown", + "id": "Yu3bAUPoLrPI", + "metadata": { + "id": "Yu3bAUPoLrPI" + }, + "source": [ + "## Create document store ([FAISS](https://github.com/facebookresearch/faiss)) and write documents\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "bfe846df", + "metadata": { + "execution": { + "iopub.execute_input": "2022-01-09T08:40:59.678181Z", + "iopub.status.busy": "2022-01-09T08:40:59.678003Z", + "iopub.status.idle": "2022-01-09T08:40:59.753228Z", + "shell.execute_reply": "2022-01-09T08:40:59.752500Z", + "shell.execute_reply.started": "2022-01-09T08:40:59.678161Z" + }, + "id": "bfe846df" + }, + "outputs": [], + "source": [ + "from haystack.document_stores import FAISSDocumentStore\n", + "\n", + "# the document store settings are those compatible with Embedding Retriever\n", + "document_store = FAISSDocumentStore(\n", + " similarity=\"dot_product\",\n", + " embedding_dim=768)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "191144b4", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49, + "referenced_widgets": [ + "ab82841df4bd4d4e89da2f5b48af7d90", + "7d18cadf709e49ceaa6f4b48dc950e1b", + "db134d0c7c05414e9a2eb6426c916ecc", + "3ea242b4bd3e4afc96ae707a08149792", + "395e4edafc02446383d577edf80a69da", + "7995fe6d56e54a8099755c2b584476db", + "3f8289ac8a7147ae9e4b30bd7c0aa174", + "6be0d76fc58540cd84d4b84c55b16200", + "a0734711249b45f0950d2f68d30b9b3a", + "a20a18a0d4a342249237c72d9e800e24", + "465495a0c471437ea28fa01d97c80cd8" + ] + 
}, + "execution": { + "iopub.execute_input": "2022-01-09T08:41:10.695292Z", + "iopub.status.busy": "2022-01-09T08:41:10.695064Z", + "iopub.status.idle": "2022-01-09T08:41:22.144864Z", + "shell.execute_reply": "2022-01-09T08:41:22.144203Z", + "shell.execute_reply.started": "2022-01-09T08:41:10.695271Z" + }, + "id": "191144b4", + "outputId": "2ce7622e-773e-4155-f7f4-f2b23c18d3c7" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ab82841df4bd4d4e89da2f5b48af7d90", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Writing Documents: 0%| | 0/2825 [00:00