Spaces:
Running
Running
File size: 2,301 Bytes
579ab0b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
{
"cells": [
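{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Notebook for creating documents from the curated QA-pair dataset\n",
"\n",
"Loads the golden QA set, removes duplicate and template answers, wraps each question-answer pair in a `Document`, and passes the resulting list to `store_documents`."
]
},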
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"from langchain.docstore.document import Document\n",
"\n",
"from ipynb.fs.defs.preprocess_data import store_documents\n",
"\n",
"# Load QA dataset. JSON is expected to be UTF-8, so decode it explicitly\n",
"# instead of relying on the platform's default encoding.\n",
"with open(\"./../input_data/QA_dataset/golden_qa_set.json\", 'r', encoding=\"utf-8\") as file:\n",
"    golden_qa_set = json.load(file)\n",
"\n",
"# Remove duplicate answers (Kersten + Secondary Literature) and template answers.\n",
"# The ranges are 0-based positions in golden_qa_set['qa_set'] with exclusive\n",
"# upper bounds, so positions 121 and 133 are kept.\n",
"indices_to_remove = list(range(102, 121)) + list(range(122, 133)) + list(range(134, 157))\n",
"# Delete from the highest index down so that earlier deletions do not shift\n",
"# the positions that are still waiting to be removed.\n",
"indices_to_remove = sorted(set(indices_to_remove), reverse=True)\n",
"for index in indices_to_remove:\n",
"    del golden_qa_set['qa_set'][index]\n",
"\n",
"# Parallel lists: question_set[i] belongs to golden_answer_set[i].\n",
"question_set = [qa['question'] for qa in golden_qa_set['qa_set']]\n",
"golden_answer_set = [qa['golden_answer'] for qa in golden_qa_set['qa_set']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Build one Document per QA pair: the text is the question followed by its\n",
"# golden answer, and every document carries the same \"QA Dataset\" metadata.\n",
"all_qa_dataset_documents = [\n",
"    Document(\n",
"        page_content=f\"{question} \\n {answer}\",\n",
"        metadata={\"source\": \"QA Dataset\", \"title\": \"QA Dataset\"},\n",
"    )\n",
"    for question, answer in zip(question_set, golden_answer_set)\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hand the QA documents to store_documents (imported from the preprocess_data\n",
"# notebook). NOTE(review): the second argument looks like the output location;\n",
"# confirm against store_documents' definition.\n",
"qa_documents_target = \"./../input_data/QA_dataset/all_documents\"\n",
"store_documents(all_qa_dataset_documents, qa_documents_target)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|