{"cells":[{"cell_type":"markdown","id":"3922a573","metadata":{},"source":["# Index creation"]},{"cell_type":"markdown","id":"viixGIJcKPSQ","metadata":{"id":"viixGIJcKPSQ"},"source":["## Preliminary operations"]},{"cell_type":"code","execution_count":1,"id":"MevE4jEZ5QBT","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":25189,"status":"ok","timestamp":1652189481823,"user":{"displayName":"Stefano Fiorucci","userId":"12409279692445770059"},"user_tz":-120},"id":"MevE4jEZ5QBT","outputId":"d4b2a927-e000-442b-ebc6-0d40d8a165d6"},"outputs":[{"name":"stdout","output_type":"stream","text":["Mounted at /content/drive\n"]}],"source":["# Make Google Drive visible to this Colab runtime; the data directory read below lives on it.\n","from google.colab import drive\n","\n","DRIVE_MOUNT_POINT = '/content/drive'\n","drive.mount(DRIVE_MOUNT_POINT)"]},{"cell_type":"code","execution_count":null,"id":"VYWRJ-Lf55nV","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":161669,"status":"ok","timestamp":1652189651623,"user":{"displayName":"Stefano Fiorucci","userId":"12409279692445770059"},"user_tz":-120},"id":"VYWRJ-Lf55nV","outputId":"5c860ef6-d4cb-4293-d704-51454a3f88bf"},"outputs":[],"source":["# install dependencies\n","! 
pip install farm-haystack[faiss-gpu]"]},{"cell_type":"markdown","id":"QVDuHAMIK4bg","metadata":{"id":"QVDuHAMIK4bg"},"source":["## Load data"]},{"cell_type":"code","execution_count":3,"id":"72139774","metadata":{"execution":{"iopub.execute_input":"2022-01-09T08:40:46.176031Z","iopub.status.busy":"2022-01-09T08:40:46.175755Z","iopub.status.idle":"2022-01-09T08:40:46.179554Z","shell.execute_reply":"2022-01-09T08:40:46.178704Z","shell.execute_reply.started":"2022-01-09T08:40:46.175959Z"},"executionInfo":{"elapsed":32,"status":"ok","timestamp":1652189651625,"user":{"displayName":"Stefano Fiorucci","userId":"12409279692445770059"},"user_tz":-120},"id":"72139774"},"outputs":[],"source":["import glob\n","import json"]},{"cell_type":"code","execution_count":4,"id":"4421e328","metadata":{"execution":{"iopub.execute_input":"2022-01-09T08:40:47.846999Z","iopub.status.busy":"2022-01-09T08:40:47.846757Z","iopub.status.idle":"2022-01-09T08:40:48.327632Z","shell.execute_reply":"2022-01-09T08:40:48.326829Z","shell.execute_reply.started":"2022-01-09T08:40:47.846975Z"},"executionInfo":{"elapsed":24363,"status":"ok","timestamp":1652189675961,"user":{"displayName":"Stefano Fiorucci","userId":"12409279692445770059"},"user_tz":-120},"id":"4421e328"},"outputs":[],"source":["DATA_DIRECTORY = '/content/drive/MyDrive/Colab Notebooks/wklp/data'\n","\n","# glob.glob() returns paths in arbitrary, filesystem-dependent order; sort them\n","# so that positional look-ups such as docs[5] are reproducible across runs.\n","json_files = sorted(glob.glob(f'{DATA_DIRECTORY}/*.json'))\n","\n","# Fail fast if Drive is not mounted or the path is wrong, instead of silently\n","# continuing with an empty document list.\n","assert json_files, f'No JSON files found in {DATA_DIRECTORY}'\n","\n","docs = []\n","\n","for json_file in json_files:\n","    # Read as UTF-8 explicitly instead of relying on the platform default encoding.\n","    with open(json_file, 'r', encoding='utf-8') as fin:\n","        json_content = json.load(fin)\n","\n","    # Keep the text under 'content' and the name/URL under 'meta'.\n","    docs.append({\n","        'content': json_content['text'],\n","        'meta': {\n","            'name': json_content['name'],\n","            'url': json_content['url'],\n","        },\n","    })"]},{"cell_type":"code","execution_count":5,"id":"GR6qWQAn72WG","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":9,"status":"ok","timestamp":1652189679928,"user":{"displayName":"Stefano 
Fiorucci","userId":"12409279692445770059"},"user_tz":-120},"id":"GR6qWQAn72WG","outputId":"3e17336f-1145-43ff-c3ca-fab7604343d1"},"outputs":[{"data":{"text/plain":["1087"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["len(docs)"]},{"cell_type":"code","execution_count":6,"id":"aa231b94","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{"iopub.execute_input":"2022-01-09T08:40:48.796741Z","iopub.status.busy":"2022-01-09T08:40:48.796550Z","iopub.status.idle":"2022-01-09T08:40:48.805224Z","shell.execute_reply":"2022-01-09T08:40:48.804705Z","shell.execute_reply.started":"2022-01-09T08:40:48.796722Z"},"executionInfo":{"elapsed":10,"status":"ok","timestamp":1652189681394,"user":{"displayName":"Stefano Fiorucci","userId":"12409279692445770059"},"user_tz":-120},"id":"aa231b94","outputId":"a42147fb-b9a4-4500-cc96-ce73177030f9"},"outputs":[{"data":{"text/plain":["{'content': \"Pete Lindstrom\\nPete Lindstrom was a citizen of Twin Peaks, Washington who was killed in the Blizzard of 1889.\\nHis death was witnessed by Knut Zimmerman, who reported that wind had plunged a candle from the Annual Candlelighting and Christmas Tree Ceremony into the back of Lindstrom's head, killing him.\",\n"," 'meta': {'name': 'Pete_Lindstrom',\n"," 'url': 'https://twinpeaks.fandom.com/wiki/Pete_Lindstrom'}}"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["docs[5]"]},{"cell_type":"markdown","id":"Yu3bAUPoLrPI","metadata":{"id":"Yu3bAUPoLrPI"},"source":["## Define document store ([FAISS](https://github.com/facebookresearch/faiss)) and write 
documents\n","\n"]},{"cell_type":"code","execution_count":8,"id":"bfe846df","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{"iopub.execute_input":"2022-01-09T08:40:59.678181Z","iopub.status.busy":"2022-01-09T08:40:59.678003Z","iopub.status.idle":"2022-01-09T08:40:59.753228Z","shell.execute_reply":"2022-01-09T08:40:59.752500Z","shell.execute_reply.started":"2022-01-09T08:40:59.678161Z"},"executionInfo":{"elapsed":10410,"status":"ok","timestamp":1652190218453,"user":{"displayName":"Stefano Fiorucci","userId":"12409279692445770059"},"user_tz":-120},"id":"bfe846df","outputId":"187c2d40-470a-4f87-ab50-ec4082bccb33"},"outputs":[{"name":"stderr","output_type":"stream","text":["INFO - haystack.modeling.model.optimization - apex not found, won't use it. See https://nvidia.github.io/apex/\n","ERROR - root - Failed to import 'magic' (from 'python-magic' and 'python-magic-bin' on Windows). FileTypeClassifier will not perform mimetype detection on extensionless files. Please make sure the necessary OS libraries are installed if you need this functionality.\n","INFO - haystack.telemetry - Haystack sends anonymous usage data to understand the actual usage and steer dev efforts towards features that are most meaningful to users. You can opt-out at anytime by calling disable_telemetry() or by manually setting the environment variable HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation page. 
More information at https://haystack.deepset.ai/guides/telemetry\n"]}],"source":["from haystack.document_stores import FAISSDocumentStore\n","\n","# the document store settings are those compatible with Embedding Retriever\n","document_store = FAISSDocumentStore(\n"," similarity=\"dot_product\",\n"," embedding_dim=768)"]},{"cell_type":"code","execution_count":9,"id":"bc5adb1c","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{"iopub.execute_input":"2022-01-09T08:41:04.538529Z","iopub.status.busy":"2022-01-09T08:41:04.538227Z","iopub.status.idle":"2022-01-09T08:41:05.147190Z","shell.execute_reply":"2022-01-09T08:41:05.146513Z","shell.execute_reply.started":"2022-01-09T08:41:04.538503Z"},"executionInfo":{"elapsed":2085,"status":"ok","timestamp":1652190317389,"user":{"displayName":"Stefano Fiorucci","userId":"12409279692445770059"},"user_tz":-120},"id":"bc5adb1c","outputId":"4cc11a2d-5ce5-41c1-e5eb-a0ee411ab00b"},"outputs":[{"name":"stdout","output_type":"stream","text":["[nltk_data] Downloading package punkt to /root/nltk_data...\n","[nltk_data] Unzipping tokenizers/punkt.zip.\n"]},{"name":"stderr","output_type":"stream","text":[" 0%| | 0/1087 [00:00\n"]}],"source":["print(preprocessed_docs[5])\n"]},{"cell_type":"code","execution_count":12,"id":"b9PS0PkM_1EF","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":370,"status":"ok","timestamp":1652190343399,"user":{"displayName":"Stefano 
Fiorucci","userId":"12409279692445770059"},"user_tz":-120},"id":"b9PS0PkM_1EF","outputId":"25fba54f-46d9-4c53-b0c1-15e8a878cad0"},"outputs":[{"data":{"text/plain":["2825"]},"execution_count":12,"metadata":{},"output_type":"execute_result"}],"source":["len(preprocessed_docs)"]},{"cell_type":"code","execution_count":81,"id":"191144b4","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":49,"referenced_widgets":["425730d860514e2d87c0870cbb943842","06c58f8fc29343fa96e36d5b1f8dd078","046fa73af99645cc88b49c0f3e5f96b7","e256a26a0f41436a9755c56f3ffebd11","1e2bf8bf2ab14c9e880c06b04f752a1b","1377c76f1051467fb391c2c0119b0634","4d4babe9fcb24dd7996ecbeb7006018f","ff4bc8be1b8041e6a116bc37e366bf96","e004a6c61f2d4e1d8e9d02c51dcc6ebd","88c675dce7bd4247842ffeb6470d31dd","1d447ec86fe84008b29495ecb78a7fac"]},"execution":{"iopub.execute_input":"2022-01-09T08:41:10.695292Z","iopub.status.busy":"2022-01-09T08:41:10.695064Z","iopub.status.idle":"2022-01-09T08:41:22.144864Z","shell.execute_reply":"2022-01-09T08:41:22.144203Z","shell.execute_reply.started":"2022-01-09T08:41:10.695271Z"},"executionInfo":{"elapsed":11491,"status":"ok","timestamp":1652179167100,"user":{"displayName":"Stefano Fiorucci","userId":"12409279692445770059"},"user_tz":-120},"id":"191144b4","outputId":"c30f2216-2c6c-4f28-867c-dfc0bd76bc09"},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"425730d860514e2d87c0870cbb943842","version_major":2,"version_minor":0},"text/plain":["Writing Documents: 0%| | 0/2825 [00:00