Commit 9c37331
initial commit

Files changed:
- .gitattributes +3 -0
- .gitignore +2 -0
- Notebooks/scratchpad.ipynb +141 -0
- Notebooks/tutorials/RAG_basic.ipynb +0 -0
- Notebooks/tutorials/ScholarBot.ipynb +0 -0
- README.md +7 -0
- app/app.py +62 -0
- configs/llm_producer.yaml +13 -0
- configs/llm_refiner.yaml +6 -0
- configs/pipeline.yaml +17 -0
- llm/__init__.py +0 -0
- llm/__pycache__/__init__.cpython-310.pyc +0 -0
- llm/__pycache__/answer_generator.cpython-310.pyc +0 -0
- llm/__pycache__/base_llm.cpython-310.pyc +0 -0
- llm/__pycache__/query_refiner.cpython-310.pyc +0 -0
- llm/answer_generator.py +38 -0
- llm/base_llm.py +12 -0
- llm/query_refiner.py +20 -0
- pyproject.toml +7 -0
- ragbot.egg-info/PKG-INFO +3 -0
- ragbot.egg-info/SOURCES.txt +15 -0
- ragbot.egg-info/dependency_links.txt +1 -0
- ragbot.egg-info/top_level.txt +2 -0
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-310.pyc +0 -0
- src/__pycache__/embedding.cpython-310.pyc +0 -0
- src/__pycache__/pipeline.cpython-310.pyc +0 -0
- src/__pycache__/preprocess.cpython-310.pyc +0 -0
- src/__pycache__/utils.cpython-310.pyc +0 -0
- src/embedding.py +40 -0
- src/pipeline.py +153 -0
- src/preprocess.py +44 -0
- src/utils.py +63 -0
.gitattributes
ADDED
@@ -0,0 +1,3 @@
+*.pdf filter=lfs diff=lfs merge=lfs -text
+*.sqlite3 filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,2 @@
+.env
+data/
Notebooks/scratchpad.ipynb
ADDED
@@ -0,0 +1,141 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "04cabe4c",
+   "metadata": {},
+   "source": [
+    "Uncomment and run if dependencies are not installed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "cc4d2b9b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# !pip install -q pyyaml\n",
+    "# !pip install -q requests\n",
+    "# !pip install -q dotenv\n",
+    "# !pip install -qU langchain-community\n",
+    "# !pip install -q pypdf\n",
+    "# %pip install -qU langchain-groq\n",
+    "# !pip install -q chromadb\n",
+    "# !pip install -q sentence-transformers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "7cdfaebc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import os\n",
+    "\n",
+    "project_root = os.path.abspath(\"..\") # adjust this depending on where your notebook lives\n",
+    "if project_root not in sys.path:\n",
+    "    sys.path.insert(0, project_root)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "72e187e0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from src.pipeline import ChatPipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "f79416f1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from src.utils import load_config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "ba557b13",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cp = ChatPipeline()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "49dc2580",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "d:\\Thesis\\Vinayak Rana\\LLM\\RAG\\src\\embedding.py:16: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFaceEmbeddings``.\n",
+      "  return HuggingFaceEmbeddings(model_name=self.model_name)\n",
+      "c:\\Users\\vinny\\Miniconda3\\envs\\scholarchatbot\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n",
+      "d:\\Thesis\\Vinayak Rana\\LLM\\RAG\\src\\pipeline.py:79: LangChainDeprecationWarning: Since Chroma 0.4.x the manual persistence method is no longer supported as docs are automatically persisted.\n",
+      "  vector_store.persist()\n",
+      "d:\\Thesis\\Vinayak Rana\\LLM\\RAG\\llm\\answer_generator.py:23: LangChainDeprecationWarning: Please see the migration guide at: https://python.langchain.com/docs/versions/migrating_memory/\n",
+      "  self.memory = ConversationBufferWindowMemory(\n"
+     ]
+    }
+   ],
+   "source": [
+    "cp.setup(arxiv_id=\"2407.05040\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "ca77354b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Based on the provided context, here\\'s a differentiation between Self-Instruct, Evol-Instruct, and OSSInstruct:\\n\\n1. **Self-Instruct**: This technique is used to align language models with self-generated instructions. It involves generating instruction-following data points through the Self-Instruct technique, which is utilized in Codealpaca and CodeLlama. The Self-Instruct technique is described in the paper \"Self-instruct: Aligning language models with self-generated instructions\" by Yizhong Wang et al. (2022).\\n\\n2. **Evol-Instruct**: This technique is used to evolve instruction-following data in both depth and breadth dimensions. It is employed in Wizardcoder to further evolve the Codealpaca dataset. The Evol-Instruct method is described in the paper \"EvolInstruct\" by Can Xu et al. (2023a).\\n\\n3. **OSSInstruct**: This technique is used to create instruction-following data from unlabeled open-source code snippets. It is employed in Magicoder to construct a method. The OSSInstruct technique is not described in detail in the provided context, but it is mentioned as a distinct method used in Magicoder.\\n\\nIn summary, Self-Instruct generates instruction-following data points, Evol-Instruct evolves instruction-following data, and OSSInstruct creates instruction-following data from open-source code snippets.'"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cp.query(\"can you differentiate between self instruct , evol instruct and OSS ?\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "scholarchatbot",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
Notebooks/tutorials/RAG_basic.ipynb
ADDED
The diff for this file is too large to render. See raw diff.

Notebooks/tutorials/ScholarBot.ipynb
ADDED
The diff for this file is too large to render. See raw diff.
README.md
ADDED
@@ -0,0 +1,7 @@
+# RAG
+
+## What should be the input format?
+* Extract text from the PDF?
+* Use the .tex file from the submission?
+##### The LaTeX format might not be very good for the LLM (especially when it is a smaller LLM), so extracting text from the PDF would be better.
+We could bring in LaTeX later if the output doesn't seem good enough, or if we are missing the mathematical equations.
app/app.py
ADDED
@@ -0,0 +1,62 @@
+import os
+import tempfile
+import streamlit as st
+import sys
+from dotenv import load_dotenv
+load_dotenv()
+
+project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))  # dirty fix ----> can fix this with pip install -e .
+if project_root not in sys.path:
+    sys.path.insert(0, project_root)
+
+from src.pipeline import ChatPipeline
+
+st.set_page_config(page_title="ScholarBot", layout="wide")
+st.title("ScholarBot: Chat with Research Papers")
+
+if "chat_pipeline" not in st.session_state:
+    st.session_state.chat_pipeline = None
+if "chat_history" not in st.session_state:
+    st.session_state.chat_history = []
+
+st.sidebar.header("Input Paper")
+input_method = st.sidebar.radio("Choose input method:", ("Upload PDF", "arXiv ID"))
+
+refine_query = st.sidebar.checkbox("Refine query before answering?", value=True)
+
+if input_method == "Upload PDF":
+    uploaded_file = st.sidebar.file_uploader("Upload a PDF file", type=["pdf"])
+    if uploaded_file is not None:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+            tmp_file.write(uploaded_file.read())
+            pdf_path = tmp_file.name
+
+        st.info("Setting up ScholarBot...")
+        st.session_state.chat_pipeline = ChatPipeline()
+        st.session_state.chat_pipeline.setup_from_pdf(pdf_path)
+        st.success("PDF loaded and indexed successfully!")
+else:
+    arxiv_id = st.sidebar.text_input("Enter arXiv ID:")
+    if st.sidebar.button("Load Paper") and arxiv_id:
+        st.info("Setting up ScholarBot...")
+        st.session_state.chat_pipeline = ChatPipeline()
+        st.session_state.chat_pipeline.setup(arxiv_id=arxiv_id)
+        st.success(f"arXiv paper {arxiv_id} loaded successfully!")
+
+st.subheader("Chat with the Paper")
+user_input = st.text_input("Ask a question:", placeholder="e.g. What is the JointMI acquisition function?")
+
+if st.button("Generate Answer") and user_input:
+    if st.session_state.chat_pipeline:
+        answer = st.session_state.chat_pipeline.query(user_input, refine_query=refine_query)
+        st.session_state.chat_history.append((user_input, answer))
+    else:
+        st.warning("Please load a paper first.")
+
+if st.session_state.chat_history:
+    st.markdown("---")
+    st.subheader("📜 Chat History")
+    for q, a in st.session_state.chat_history[::-1]:
+        st.markdown(f"**You:** {q}")
+        st.markdown(f"**ScholarBot:** {a}")
+        st.markdown("---")
configs/llm_producer.yaml
ADDED
@@ -0,0 +1,13 @@
+model_name: llama-3.1-8b-instant
+temperature: 0.2
+max_tokens: 512
+memory_window: 3
+prompt_template: |
+  You are a helpful research assistant. Use the context below to answer the question.
+  If the answer is not in the context, say "I don't know."
+
+  Context:
+  {context}
+
+  Question:
+  {question}
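Editorial note: the `prompt_template` above is consumed by `PromptTemplate.from_template` in llm/answer_generator.py and must expose exactly the `{context}` and `{question}` variables. A minimal sketch to render it standalone (not part of the commit; assumes the repo root as working directory):

```python
# Render the producer prompt with dummy values to check the placeholders.
from langchain.prompts import PromptTemplate

from src.utils import load_config

config = load_config("./configs/llm_producer.yaml")
prompt = PromptTemplate.from_template(config["prompt_template"])

# The template expects exactly these two variables.
print(prompt.format(context="<retrieved chunks>", question="<user question>"))
```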
configs/llm_refiner.yaml
ADDED
@@ -0,0 +1,6 @@
+model_name: llama-3.1-8b-instant
+temperature: 0.3
+max_tokens: 100
+system_prompt: |
+  You are a query refining assistant. Improve the user's question to be more specific, clear, and relevant for a technical document search.
+  Preserve the original meaning. Avoid adding new facts. Use formal language if needed.
configs/pipeline.yaml
ADDED
@@ -0,0 +1,17 @@
+storage:
+  save_pdf_path: ./data/pdfs
+  persist_vector_db: True
+  vector_db_path: ./data/vector_db
+
+text_splitter:
+  chunk_size: 1000
+  chunk_overlap: 200
+
+embedding:
+  model_name: all-MiniLM-L6-v2
+  model_type: huggingface
+
+vector_db:
+  path: "./data/vector_db/chroma_db"
+  search_kwargs:
+    "k": 3
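Editorial note: a quick check that the config parses and that the nested keys src/pipeline.py reads all resolve (a sketch, assuming the repo root as working directory):

```python
# Smoke test for the pipeline config; values match the YAML above.
from src.utils import load_config

config = load_config("./configs/pipeline.yaml")
print(config["text_splitter"]["chunk_size"])   # 1000
print(config["embedding"]["model_name"])       # all-MiniLM-L6-v2
print(config["vector_db"]["search_kwargs"])    # {'k': 3}
```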
llm/__init__.py
ADDED
File without changes
llm/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (139 Bytes)

llm/__pycache__/answer_generator.cpython-310.pyc
ADDED
Binary file (1.5 kB)

llm/__pycache__/base_llm.cpython-310.pyc
ADDED
Binary file (666 Bytes)

llm/__pycache__/query_refiner.cpython-310.pyc
ADDED
Binary file (1.03 kB)
llm/answer_generator.py
ADDED
@@ -0,0 +1,38 @@
+from llm.base_llm import BaseLLM
+from src.utils import load_config
+from langchain_groq import ChatGroq
+from langchain.prompts import PromptTemplate
+from langchain.memory import ConversationBufferWindowMemory
+from langchain.chains import ConversationalRetrievalChain
+
+class GroqAnswerGenerator(BaseLLM):
+    def __init__(self, model_name: str, temperature: float, max_tokens: int, retriever=None):
+
+        self.retriever = retriever
+        self.config = load_config("./configs/llm_producer.yaml")
+        self.model = ChatGroq(
+            model=model_name,
+            temperature=temperature,
+            max_tokens=max_tokens
+        )
+
+        self.prompt_template = PromptTemplate.from_template(
+            self.config["prompt_template"]
+        )
+
+        self.memory = ConversationBufferWindowMemory(
+            memory_key="chat_history",  # required by ConversationalRetrievalChain
+            return_messages=True,
+            k=self.config["memory_window"],
+        )
+        self.qa_chain = ConversationalRetrievalChain.from_llm(
+            llm=self.model,
+            retriever=self.retriever,
+            memory=self.memory,
+            chain_type="stuff",
+            combine_docs_chain_kwargs={
+                "prompt": self.prompt_template}
+        )
+
+    def generate_answer(self, prompt: str):
+        return self.qa_chain.run(question=prompt)
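Editorial note: `GroqAnswerGenerator` is normally constructed by `ChatPipeline`, but it can be exercised directly once a retriever exists. A rough sketch (not part of the commit; assumes GROQ_API_KEY is set in the environment, which `ChatGroq` reads, and that setup has already built a retriever):

```python
import os

from src.pipeline import ChatPipeline
from llm.answer_generator import GroqAnswerGenerator

assert os.environ.get("GROQ_API_KEY"), "ChatGroq reads the key from the environment"

cp = ChatPipeline()
cp.setup(arxiv_id="2407.05040")  # builds cp.retriever as a side effect

bot = GroqAnswerGenerator(
    model_name="llama-3.1-8b-instant",
    temperature=0.2,
    max_tokens=512,
    retriever=cp.retriever,
)
print(bot.generate_answer("What problem does the paper address?"))
```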
llm/base_llm.py
ADDED
@@ -0,0 +1,12 @@
+from abc import ABC, abstractmethod
+
+class BaseLLM(ABC):
+    """
+    Base class for all LLMs (Large Language Models).
+    """
+    @abstractmethod
+    def generate_answer(self, question: str):
+        """
+        This is an abstract method that must be implemented by subclasses.
+        """
+        pass
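Editorial note: inheriting from `ABC` (added above) is what makes `@abstractmethod` actually enforce the contract. A hypothetical toy subclass, only to illustrate the interface (not part of the commit):

```python
from llm.base_llm import BaseLLM

class EchoLLM(BaseLLM):
    """Toy subclass used only to illustrate the BaseLLM interface."""

    def generate_answer(self, question: str):
        return f"echo: {question}"

print(EchoLLM().generate_answer("hello"))  # echo: hello
```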
llm/query_refiner.py
ADDED
@@ -0,0 +1,20 @@
+from langchain_groq import ChatGroq
+from src.utils import load_config
+from langchain.prompts import ChatPromptTemplate
+
+class QueryRefiner:
+    def __init__(self):
+        config = load_config("./configs/llm_refiner.yaml")
+        self.model = ChatGroq(
+            model=config["model_name"],
+            temperature=config["temperature"],
+            max_tokens=config["max_tokens"]
+        )
+        self.prompt = ChatPromptTemplate.from_messages([
+            ("system", config["system_prompt"]),
+            ("human", "{query}")
+        ])
+
+    def refine(self, query: str):
+        chain = self.prompt | self.model
+        return chain.invoke({"query": query}).content
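Editorial note: a one-off check of the refiner (not part of the commit; assumes a valid GROQ_API_KEY in .env and the repo root as working directory):

```python
from dotenv import load_dotenv

from llm.query_refiner import QueryRefiner

load_dotenv()  # ChatGroq picks up GROQ_API_KEY from the environment

refiner = QueryRefiner()
# Same rough question as in the scratchpad notebook; the refiner should
# return a cleaner, more specific version of it.
print(refiner.refine("can you differentiate between self instruct , evol instruct and OSS ?"))
```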
pyproject.toml
ADDED
@@ -0,0 +1,7 @@
+[project]
+name = "ragbot"
+version = "0.1.0"
+dependencies = []
+
+[tool.setuptools]
+packages = ["src", "llm"]
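Editorial note: this packaging metadata is what makes the sys.path "dirty fix" in app/app.py and the notebook unnecessary. After an editable install (`pip install -e .` from the repo root, as the comment in app/app.py suggests), both packages import directly; a minimal check:

```python
# After `pip install -e .`, these imports resolve from any working
# directory, with no sys.path manipulation needed.
from src.utils import load_config
from llm.base_llm import BaseLLM

print(load_config.__module__, BaseLLM.__name__)
```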
ragbot.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,3 @@
+Metadata-Version: 2.4
+Name: ragbot
+Version: 0.1.0
ragbot.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,15 @@
+README.md
+pyproject.toml
+llm/__init__.py
+llm/answer_generator.py
+llm/base_llm.py
+llm/query_refiner.py
+ragbot.egg-info/PKG-INFO
+ragbot.egg-info/SOURCES.txt
+ragbot.egg-info/dependency_links.txt
+ragbot.egg-info/top_level.txt
+src/__init__.py
+src/embedding.py
+src/pipeline.py
+src/preprocess.py
+src/utils.py
ragbot.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
+
ragbot.egg-info/top_level.txt
ADDED
@@ -0,0 +1,2 @@
+llm
+src
src/__init__.py
ADDED
File without changes
src/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (139 Bytes)

src/__pycache__/embedding.cpython-310.pyc
ADDED
Binary file (1.75 kB)

src/__pycache__/pipeline.cpython-310.pyc
ADDED
Binary file (4.75 kB)

src/__pycache__/preprocess.cpython-310.pyc
ADDED
Binary file (1.26 kB)

src/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (1.85 kB)
src/embedding.py
ADDED
@@ -0,0 +1,40 @@
+from typing import Union, List
+from langchain.embeddings import HuggingFaceEmbeddings
+
+class EmbeddingModel:
+    """
+    A flexible embedding model wrapper supporting multiple backend models.
+    """
+
+    def __init__(self, model_type: str = "huggingface", model_name: str = "all-MiniLM-L6-v2"):
+        self.model_type = model_type
+        self.model_name = model_name
+        self.model = self._load_model()
+
+    def _load_model(self):
+        if self.model_type == "huggingface":
+            return HuggingFaceEmbeddings(model_name=self.model_name)
+
+        # Implementation for other model types can be added here
+
+        else:
+            raise ValueError(f"Unsupported model type: {self.model_type}")
+
+    def embed(self, text: Union[str, List[str]]):
+        """
+        Generate embeddings for the given text.
+
+        :param text: A string or list of strings.
+        :return: A list of embeddings.
+        """
+        if self.model_type == "huggingface":
+
+            if isinstance(text, list):
+                return [self.model.embed_query(t) for t in text]
+            return self.model.embed_query(text)
+
+        elif self.model_type == "sentence_transformers":
+            return self.model.encode(text, convert_to_tensor=True).tolist()
+
+        else:
+            raise NotImplementedError(f"Embedding for {self.model_type} is not implemented.")
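Editorial note: `embed` returns one vector per input, and all-MiniLM-L6-v2 produces 384-dimensional vectors. A small sketch (not part of the commit; model weights download on first use):

```python
from src.embedding import EmbeddingModel

emb = EmbeddingModel(model_type="huggingface", model_name="all-MiniLM-L6-v2")

single = emb.embed("retrieval augmented generation")
batch = emb.embed(["first chunk", "second chunk"])

print(len(single))                 # 384 dimensions for all-MiniLM-L6-v2
print(len(batch), len(batch[0]))   # 2 vectors, 384 dimensions each
```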
src/pipeline.py
ADDED
@@ -0,0 +1,153 @@
+from src.utils import load_config
+from dotenv import load_dotenv
+from src.utils import get_pdf_from_url
+from src.preprocess import Preprocessor
+from src.embedding import EmbeddingModel
+from src.utils import extract_text_from_pdf
+from langchain.vectorstores import Chroma
+from llm.answer_generator import GroqAnswerGenerator
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from llm.query_refiner import QueryRefiner
+
+
+class ChatPipeline:
+    def __init__(self, arxiv_id: str = None):
+
+        self.arxiv_id = arxiv_id
+        self.config = load_config()
+        self.chatbot_config = load_config("./configs/llm_producer.yaml")
+        self.chunks = None
+        self.retriever = None
+        self.chatbot = None
+
+    def _preprocess_docs(self, docs):
+        """
+        Preprocess the documents using the Preprocessor class.
+
+        Args:
+            docs (list): The documents to preprocess.
+
+        Returns:
+            list: The preprocessed documents.
+        """
+        if not docs:
+            raise ValueError("No documents provided for preprocessing.")
+        if not isinstance(docs, list):
+            raise TypeError("Expected a list of documents for preprocessing.")
+        if not all(hasattr(doc, 'page_content') for doc in docs):
+            raise ValueError("All documents must have a 'page_content' attribute.")
+
+        preprocessor = Preprocessor()
+
+        for doc in docs:
+            doc.page_content = preprocessor(doc.page_content)
+        return docs
+
+    def _create_chunks(self, docs):
+        """
+        Create chunks from the preprocessed documents.
+
+        Args:
+            docs (list): List of preprocessed documents.
+
+        Returns:
+            list: List of document chunks.
+        """
+
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=self.config["text_splitter"]["chunk_size"],
+            chunk_overlap=self.config["text_splitter"]["chunk_overlap"]
+        )
+
+        return text_splitter.split_documents(docs)
+
+    def _create_vector_store(self, chunks):
+        """
+        Create a vector store from the document chunks.
+
+        Args:
+            chunks (list): List of document chunks.
+
+        Returns:
+            VectorStore: The created vector store.
+        """
+        embedding_model = EmbeddingModel(model_type=self.config['embedding']['model_type'],
+                                         model_name=self.config['embedding']['model_name'])
+        vector_store = Chroma.from_documents(
+            documents=chunks,
+            embedding=embedding_model.model,
+            persist_directory=self.config['vector_db']['path']
+        )
+        vector_store.persist()
+        self.retriever = vector_store.as_retriever(search_kwargs=self.config['vector_db']['search_kwargs'])
+
+    def setup(self, arxiv_id: str):
+        """
+        Setup the pipeline by loading necessary configurations and resources.
+        """
+        self.arxiv_id = arxiv_id
+        if not self.arxiv_id:
+            raise ValueError("arxiv_id must be provided to setup the pipeline.")
+
+        self.query_refiner = QueryRefiner()
+
+        get_pdf_from_url(self.arxiv_id, self.config['storage']['save_pdf_path'])
+
+        documents = extract_text_from_pdf(f"{self.config['storage']['save_pdf_path']}/{self.arxiv_id}.pdf")
+
+        preprocessed_docs = self._preprocess_docs(documents)
+
+        self.chunks = self._create_chunks(preprocessed_docs)
+
+        self._create_vector_store(self.chunks)
+
+        self.chatbot = GroqAnswerGenerator(
+            model_name=self.chatbot_config['model_name'],
+            temperature=self.chatbot_config['temperature'],
+            max_tokens=self.chatbot_config['max_tokens'],
+            retriever=self.retriever
+        )
+
+    def setup_from_pdf(self, pdf_path: str):
+        """
+        Setup the pipeline using a local PDF file.
+        """
+        if not pdf_path:
+            raise ValueError("pdf_path must be provided to setup the pipeline.")
+
+        self.query_refiner = QueryRefiner()
+
+        documents = extract_text_from_pdf(pdf_path)
+
+        preprocessed_docs = self._preprocess_docs(documents)
+
+        self.chunks = self._create_chunks(preprocessed_docs)
+
+        self._create_vector_store(self.chunks)
+
+        self.chatbot = GroqAnswerGenerator(
+            model_name=self.chatbot_config['model_name'],
+            temperature=self.chatbot_config['temperature'],
+            max_tokens=self.chatbot_config['max_tokens'],
+            retriever=self.retriever
+        )
+
+    def query(self, prompt: str, refine_query: bool = True):
+        """
+        Query the chatbot with a prompt.
+
+        Args:
+            prompt (str): The prompt to query the chatbot with.
+            refine_query (bool): Whether to refine the prompt before answering.
+
+        Returns:
+            str: The response from the chatbot.
+        """
+        if not self.chatbot:
+            raise ValueError("Chatbot is not initialized. Call setup() method first.")
+
+        if refine_query:
+            refined_query = self.query_refiner.refine(prompt)
+            return self.chatbot.generate_answer(refined_query)
+        else:
+            return self.chatbot.generate_answer(prompt)
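Editorial note: three fixes were folded in above: `__init__` now actually stores its `arxiv_id` argument, `self.chatbot = None` makes the `query()` guard raise the intended ValueError instead of an AttributeError, and `setup_from_pdf` now creates the `QueryRefiner` (previously only `setup` did, so the Streamlit upload path crashed with the default `refine_query=True`). End-to-end usage mirrors the scratchpad notebook; a sketch, run from the repo root with GROQ_API_KEY set:

```python
from dotenv import load_dotenv

from src.pipeline import ChatPipeline

load_dotenv()  # GROQ_API_KEY for the refiner and the answer generator

cp = ChatPipeline()
cp.setup(arxiv_id="2407.05040")  # download, preprocess, chunk, embed, index

# Refined path (default) and raw path.
print(cp.query("what is OSS instruct?"))
print(cp.query("what is OSS instruct?", refine_query=False))
```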
src/preprocess.py
ADDED
@@ -0,0 +1,44 @@
+import re
+
+class Preprocessor:
+    """
+    A class for preprocessing text data.
+    This class provides methods to clean and normalize text data.
+    """
+
+    @staticmethod
+    def basic_preprocess(text):
+        """
+        Basic preprocessing of text data.
+        - Removes common boilerplate (page numbers, arXiv mentions, copyright lines)
+        - Fixes hyphenation at line breaks and merges single newlines
+        - Normalizes whitespace and strips leading/trailing whitespace
+        """
+        # Remove common strings like page numbers, arXiv mentions, etc.
+        text = re.sub(r'Page \d+|arXiv preprint.*|Copyright.*', '', text, flags=re.IGNORECASE)
+
+        # Merge single newlines within paragraphs, but keep double newlines as paragraph breaks
+        text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
+
+        # Remove hyphenations at line breaks (like "exam-\nple" -> "example")
+
+        text = re.sub(r'-\s*\n', '', text)
+
+        # First remove newline after hyphen, then remove just the hyphen if it remains
+        text = re.sub(r'-\s+', '', text)
+
+        # Normalize extra spaces (note: this also collapses the remaining paragraph breaks)
+        text = re.sub(r'\s+', ' ', text)
+
+        # Strip leading/trailing whitespace
+        text = text.strip()
+
+        return text
+
+    def __call__(self, *args, **kwds):
+        """
+        Call method to apply basic preprocessing.
+        This allows the class instance to be used as a function.
+        """
+        return self.basic_preprocess(*args, **kwds)
+
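Editorial note: the regex chain is easiest to sanity-check on a synthetic snippet (not part of the commit; output traced by hand against the substitutions above):

```python
from src.preprocess import Preprocessor

raw = (
    "Transformers use self-\n"
    "attention for long-range exam-\nple dependencies.\n"
    "Page 3\n"
    "arXiv preprint arXiv:2407.05040\n"
)

pre = Preprocessor()
print(pre(raw))
# -> "Transformers use selfattention for long-range example dependencies."
```

Note the caveat the example exposes: because single newlines become spaces before dehyphenation, line-break dehyphenation cannot distinguish a split like "exam-\nple" from a genuinely hyphenated word, so "self-\nattention" comes out as "selfattention".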
src/utils.py
ADDED
@@ -0,0 +1,63 @@
+import os
+import yaml
+import requests
+from pathlib import Path
+from langchain.document_loaders import PyPDFLoader
+
+def get_pdf_from_url(arxiv_id: str, save_dir: str) -> str:
+    """
+    Downloads a PDF from arXiv given an ID, unless already downloaded.
+
+    Returns:
+        str: Path to the downloaded (or existing) PDF.
+    """
+    os.makedirs(save_dir, exist_ok=True)
+    pdf_path = os.path.join(save_dir, f"{arxiv_id}.pdf")
+
+    if os.path.exists(pdf_path):
+        # print(f"[cache] PDF already exists: {pdf_path}")
+        return pdf_path
+
+    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
+
+    response = requests.get(url)
+    response.raise_for_status()
+
+    with open(pdf_path, "wb") as f:
+        f.write(response.content)
+
+    return pdf_path
+
+def load_config(config_path: str = "./configs/pipeline.yaml") -> dict:
+    """
+    Load a YAML configuration file and return its contents as a dictionary.
+
+    Args:
+        config_path (str): The path to the YAML configuration file.
+
+    Returns:
+        dict: The contents of the configuration file.
+    """
+    config_path = Path(config_path)
+    if not config_path.exists():
+        raise FileNotFoundError(f"Configuration file {config_path} does not exist.")
+
+    with open(config_path, 'r') as file:
+        config = yaml.safe_load(file)
+
+    return config
+
+def extract_text_from_pdf(pdf_path: str) -> list:
+    """
+    Extract text from a PDF file.
+
+    Args:
+        pdf_path (str): The path to the PDF file.
+
+    Returns:
+        list: The documents extracted from the PDF (one per page).
+    """
+    loader = PyPDFLoader(pdf_path)
+    documents = loader.load()
+
+    return documents
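Editorial note: two fixes were folded in above: `get_pdf_from_url` previously only returned the path on a cache hit and returned None after a fresh download (the `return pdf_path` at the end fixes that, matching its docstring), and `extract_text_from_pdf` returns a list of Documents, not a str, so its annotation and docstring now say so. Taken together, the helpers cover the fetch-and-load path used by `ChatPipeline.setup`; a short sketch (not part of the commit; assumes the repo root as working directory and network access to arxiv.org):

```python
from src.utils import get_pdf_from_url, extract_text_from_pdf, load_config

config = load_config()  # defaults to ./configs/pipeline.yaml
pdf_path = get_pdf_from_url("2407.05040", config["storage"]["save_pdf_path"])

documents = extract_text_from_pdf(pdf_path)  # one Document per PDF page
print(len(documents), documents[0].page_content[:80])
```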