{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Notebook for creating the documents based on the curated QA pair dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "from ipynb.fs.defs.preprocess_data import store_documents\n",
    "from langchain.docstore.document import Document\n",
    "\n",
    "# Location of the curated question/answer pairs (relative to this notebook).\n",
    "QA_DATASET_PATH = \"./../input_data/QA_dataset/golden_qa_set.json\"\n",
    "\n",
    "# 0-based positions in `qa_set` that are dropped: duplicate answers\n",
    "# (Kersten + Secondary Literature) and template answers.\n",
    "# Positions 121 and 133 lie between the ranges and are deliberately kept.\n",
    "# Stored in descending order so deleting one entry never shifts the\n",
    "# position of an entry that still has to be deleted.\n",
    "INDICES_TO_REMOVE = sorted(\n",
    "    set(range(102, 121)) | set(range(122, 133)) | set(range(134, 157)),\n",
    "    reverse=True,\n",
    ")\n",
    "\n",
    "# Load QA dataset.\n",
    "# The encoding is passed explicitly so decoding does not depend on the\n",
    "# platform's default locale (assumes the file is UTF-8 - TODO confirm).\n",
    "with open(QA_DATASET_PATH, 'r', encoding=\"utf-8\") as file:\n",
    "    golden_qa_set = json.load(file)\n",
    "\n",
    "# Remove the unwanted entries in place; raises IndexError if the dataset\n",
    "# has fewer entries than the highest index listed above.\n",
    "for index in INDICES_TO_REMOVE:\n",
    "    del golden_qa_set['qa_set'][index]\n",
    "\n",
    "# Parallel lists: question_set[i] belongs to golden_answer_set[i].\n",
    "question_set = [qa['question'] for qa in golden_qa_set['qa_set']]\n",
    "golden_answer_set = [qa['golden_answer'] for qa in golden_qa_set['qa_set']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create one document for each question: the page content is the question\n",
    "# followed by its golden answer, and every document carries the same metadata.\n",
    "all_qa_dataset_documents = [\n",
    "    Document(\n",
    "        page_content=f\"{q} \\n {a}\",\n",
    "        metadata={\n",
    "            \"source\": \"QA Dataset\",\n",
    "            \"title\": \"QA Dataset\"\n",
    "        },\n",
    "    )\n",
    "    for q, a in zip(question_set, golden_answer_set)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hand the documents to the shared helper imported from the preprocess_data notebook.\n",
    "store_documents(all_qa_dataset_documents, \"./../input_data/QA_dataset/all_documents\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}