vinny4 committed on
Commit 9c37331 · 0 Parent(s):

initial commit
.gitattributes ADDED
@@ -0,0 +1,3 @@
+ *.pdf filter=lfs diff=lfs merge=lfs -text
+ *.sqlite3 filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .env
+ data/
Notebooks/scratchpad.ipynb ADDED
@@ -0,0 +1,141 @@
+ {
+  "cells": [
+   {
+    "cell_type": "markdown",
+    "id": "04cabe4c",
+    "metadata": {},
+    "source": [
+     "Uncomment and run if dependencies are not installed"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "id": "cc4d2b9b",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# !pip install -q pyyaml\n",
+     "# !pip install -q requests\n",
+     "# !pip install -q python-dotenv\n",
+     "# !pip install -qU langchain-community\n",
+     "# !pip install -q pypdf\n",
+     "# %pip install -qU langchain-groq\n",
+     "# !pip install -q chromadb\n",
+     "# !pip install -q sentence-transformers"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "id": "7cdfaebc",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import sys\n",
+     "import os\n",
+     "\n",
+     "project_root = os.path.abspath(\"..\")  # adjust this depending on where your notebook lives\n",
+     "if project_root not in sys.path:\n",
+     "    sys.path.insert(0, project_root)\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "id": "72e187e0",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from src.pipeline import ChatPipeline"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 3,
+    "id": "f79416f1",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from src.utils import load_config"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 4,
+    "id": "ba557b13",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "cp = ChatPipeline()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 5,
+    "id": "49dc2580",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "d:\\Thesis\\Vinayak Rana\\LLM\\RAG\\src\\embedding.py:16: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFaceEmbeddings``.\n",
+       "  return HuggingFaceEmbeddings(model_name=self.model_name)\n",
+       "c:\\Users\\vinny\\Miniconda3\\envs\\scholarchatbot\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+       "  from .autonotebook import tqdm as notebook_tqdm\n",
+       "d:\\Thesis\\Vinayak Rana\\LLM\\RAG\\src\\pipeline.py:79: LangChainDeprecationWarning: Since Chroma 0.4.x the manual persistence method is no longer supported as docs are automatically persisted.\n",
+       "  vector_store.persist()\n",
+       "d:\\Thesis\\Vinayak Rana\\LLM\\RAG\\llm\\answer_generator.py:23: LangChainDeprecationWarning: Please see the migration guide at: https://python.langchain.com/docs/versions/migrating_memory/\n",
+       "  self.memory = ConversationBufferWindowMemory(\n"
+      ]
+     }
+    ],
+    "source": [
+     "cp.setup(arxiv_id=\"2407.05040\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 7,
+    "id": "ca77354b",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "'Based on the provided context, here\\'s a differentiation between Self-Instruct, Evol-Instruct, and OSSInstruct:\\n\\n1. **Self-Instruct**: This technique is used to align language models with self-generated instructions. It involves generating instruction-following data points through the Self-Instruct technique, which is utilized in Codealpaca and CodeLlama. The Self-Instruct technique is described in the paper \"Self-instruct: Aligning language models with self-generated instructions\" by Yizhong Wang et al. (2022).\\n\\n2. **Evol-Instruct**: This technique is used to evolve instruction-following data in both depth and breadth dimensions. It is employed in Wizardcoder to further evolve the Codealpaca dataset. The Evol-Instruct method is described in the paper \"EvolInstruct\" by Can Xu et al. (2023a).\\n\\n3. **OSSInstruct**: This technique is used to create instruction-following data from unlabeled open-source code snippets. It is employed in Magicoder to construct a method. The OSSInstruct technique is not described in detail in the provided context, but it is mentioned as a distinct method used in Magicoder.\\n\\nIn summary, Self-Instruct generates instruction-following data points, Evol-Instruct evolves instruction-following data, and OSSInstruct creates instruction-following data from open-source code snippets.'"
+       ]
+      },
+      "execution_count": 7,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "cp.query(\"can you differentiate between self instruct , evol instruct and OSS ?\")"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "scholarchatbot",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.18"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
Notebooks/tutorials/RAG_basic.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Notebooks/tutorials/ScholarBot.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
README.md ADDED
@@ -0,0 +1,7 @@
+ # RAG
+ 
+ ## What should be the input format?
+ * Extract text from the PDF?
+ * Use the .tex file from the submission?
+ ##### The LaTeX format might not work very well for the LLM (especially when it is a smaller LLM), so extracting text from the PDF is the better option.
+ We could bring in LaTeX if the output doesn't seem good enough, or if we are missing the mathematical equations.
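
A minimal sketch of the PDF-extraction option discussed in the README, using the same `PyPDFLoader` that `src/utils.py` relies on; the file path is an illustrative placeholder:

```python
from langchain.document_loaders import PyPDFLoader  # same loader used in src/utils.py

# Hypothetical local path; any downloaded arXiv PDF works here.
loader = PyPDFLoader("data/pdfs/2407.05040.pdf")
documents = loader.load()  # one Document per page
print(documents[0].page_content[:200])  # peek at the first page's text
```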
app/app.py ADDED
@@ -0,0 +1,62 @@
+ import os
+ import tempfile
+ import streamlit as st
+ import sys
+ from dotenv import load_dotenv
+ load_dotenv()
+ 
+ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))  # dirty fix ----> can fix this with pip install -e .
+ if project_root not in sys.path:
+     sys.path.insert(0, project_root)
+ 
+ from src.pipeline import ChatPipeline
+ 
+ st.set_page_config(page_title="ScholarBot", layout="wide")
+ st.title("ScholarBot: Chat with Research Papers")
+ 
+ if "chat_pipeline" not in st.session_state:
+     st.session_state.chat_pipeline = None
+ if "chat_history" not in st.session_state:
+     st.session_state.chat_history = []
+ 
+ st.sidebar.header("Input Paper")
+ input_method = st.sidebar.radio("Choose input method:", ("Upload PDF", "arXiv ID"))
+ 
+ refine_query = st.sidebar.checkbox("Refine query before answering?", value=True)
+ 
+ if input_method == "Upload PDF":
+     uploaded_file = st.sidebar.file_uploader("Upload a PDF file", type=["pdf"])
+     if uploaded_file is not None:
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+             tmp_file.write(uploaded_file.read())
+             pdf_path = tmp_file.name
+ 
+         st.info("Setting up ScholarBot...")
+         st.session_state.chat_pipeline = ChatPipeline()
+         st.session_state.chat_pipeline.setup_from_pdf(pdf_path)
+         st.success("PDF loaded and indexed successfully!")
+ else:
+     arxiv_id = st.sidebar.text_input("Enter arXiv ID:")
+     if st.sidebar.button("Load Paper") and arxiv_id:
+         st.info("Setting up ScholarBot...")
+         st.session_state.chat_pipeline = ChatPipeline()
+         st.session_state.chat_pipeline.setup(arxiv_id=arxiv_id)
+         st.success(f"arXiv paper {arxiv_id} loaded successfully!")
+ 
+ st.subheader("Chat with the Paper")
+ user_input = st.text_input("Ask a question:", placeholder="e.g. What is the JointMI acquisition function?")
+ 
+ if st.button("Generate Answer") and user_input:
+     if st.session_state.chat_pipeline:
+         answer = st.session_state.chat_pipeline.query(user_input, refine_query=refine_query)
+         st.session_state.chat_history.append((user_input, answer))
+     else:
+         st.warning("Please load a paper first.")
+ 
+ if st.session_state.chat_history:
+     st.markdown("---")
+     st.subheader("📜 Chat History")
+     for q, a in st.session_state.chat_history[::-1]:
+         st.markdown(f"**You:** {q}")
+         st.markdown(f"**ScholarBot:** {a}")
+         st.markdown("---")
configs/llm_producer.yaml ADDED
@@ -0,0 +1,13 @@
+ model_name: llama-3.1-8b-instant
+ temperature: 0.2
+ max_tokens: 512
+ memory_window: 3
+ prompt_template: |
+   You are a helpful research assistant. Use the context below to answer the question.
+   If the answer is not in the context, say "I don't know."
+ 
+   Context:
+   {context}
+ 
+   Question:
+   {question}
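
For reference, a small sketch of how `llm/answer_generator.py` consumes this template via `PromptTemplate`; the context and question values below are illustrative placeholders:

```python
from langchain.prompts import PromptTemplate
from src.utils import load_config

config = load_config("./configs/llm_producer.yaml")
prompt = PromptTemplate.from_template(config["prompt_template"])
# The chain fills {context} with retrieved chunks and {question} with the user query.
print(prompt.format(context="(retrieved chunks)", question="What is JointMI?"))
```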
configs/llm_refiner.yaml ADDED
@@ -0,0 +1,6 @@
+ model_name: llama-3.1-8b-instant
+ temperature: 0.3
+ max_tokens: 100
+ system_prompt: |
+   You are a query refining assistant. Improve the user's question to be more specific, clear, and relevant for a technical document search.
+   Preserve the original meaning. Avoid adding new facts. Use formal language if needed.
configs/pipeline.yaml ADDED
@@ -0,0 +1,17 @@
+ storage:
+   save_pdf_path: ./data/pdfs
+   persist_vector_db: True
+   vector_db_path: ./data/vector_db
+ 
+ text_splitter:
+   chunk_size: 1000
+   chunk_overlap: 200
+ 
+ embedding:
+   model_name: all-MiniLM-L6-v2
+   model_type: huggingface
+ 
+ vector_db:
+   path: "./data/vector_db/chroma_db"
+   search_kwargs:
+     "k": 3
llm/__init__.py ADDED
File without changes
llm/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (139 Bytes).

llm/__pycache__/answer_generator.cpython-310.pyc ADDED
Binary file (1.5 kB).

llm/__pycache__/base_llm.cpython-310.pyc ADDED
Binary file (666 Bytes).

llm/__pycache__/query_refiner.cpython-310.pyc ADDED
Binary file (1.03 kB).

llm/answer_generator.py ADDED
@@ -0,0 +1,38 @@
+ from llm.base_llm import BaseLLM
+ from src.utils import load_config
+ from langchain_groq import ChatGroq
+ from langchain.prompts import PromptTemplate
+ from langchain.memory import ConversationBufferWindowMemory
+ from langchain.chains import ConversationalRetrievalChain
+ 
+ class GroqAnswerGenerator(BaseLLM):
+     def __init__(self, model_name: str, temperature: float, max_tokens: int, retriever=None):
+ 
+         self.retriever = retriever
+         self.config = load_config("./configs/llm_producer.yaml")
+         self.model = ChatGroq(
+             model=model_name,
+             temperature=temperature,
+             max_tokens=max_tokens
+         )
+ 
+         self.prompt_template = PromptTemplate.from_template(
+             self.config["prompt_template"]
+         )
+ 
+         self.memory = ConversationBufferWindowMemory(
+             memory_key="chat_history",  # required by ConversationalRetrievalChain
+             return_messages=True,
+             k=self.config["memory_window"],
+         )
+         self.qa_chain = ConversationalRetrievalChain.from_llm(
+             llm=self.model,
+             retriever=self.retriever,
+             memory=self.memory,
+             chain_type="stuff",
+             combine_docs_chain_kwargs={
+                 "prompt": self.prompt_template}
+         )
+ 
+     def generate_answer(self, prompt: str):
+         return self.qa_chain.run(question=prompt)
llm/base_llm.py ADDED
@@ -0,0 +1,12 @@
+ from abc import ABC, abstractmethod
+ 
+ class BaseLLM(ABC):
+     """
+     Base class for all LLMs (Large Language Models).
+     """
+     @abstractmethod
+     def generate_answer(self, question: str):
+         """
+         Generate an answer to the question; must be implemented by subclasses.
+         """
+         pass
llm/query_refiner.py ADDED
@@ -0,0 +1,20 @@
+ from langchain_groq import ChatGroq
+ from src.utils import load_config
+ from langchain.prompts import ChatPromptTemplate
+ 
+ class QueryRefiner:
+     def __init__(self):
+         config = load_config("./configs/llm_refiner.yaml")
+         self.model = ChatGroq(
+             model=config["model_name"],
+             temperature=config["temperature"],
+             max_tokens=config["max_tokens"]
+         )
+         self.prompt = ChatPromptTemplate.from_messages([
+             ("system", config["system_prompt"]),
+             ("human", "{query}")
+         ])
+ 
+     def refine(self, query: str):
+         chain = self.prompt | self.model
+         return chain.invoke({"query": query}).content
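
A minimal standalone use of `QueryRefiner`, assuming a `GROQ_API_KEY` is available in the environment; the sample query is illustrative:

```python
from llm.query_refiner import QueryRefiner

refiner = QueryRefiner()
# Returns a sharpened version of the question as a plain string.
print(refiner.refine("what is jointmi?"))
```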
pyproject.toml ADDED
@@ -0,0 +1,7 @@
+ [project]
+ name = "ragbot"
+ version = "0.1.0"
+ dependencies = []
+ 
+ [tool.setuptools]
+ packages = ["src", "llm"]
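
With this packaging configuration in place, running `pip install -e .` from the repository root makes `src` and `llm` importable from anywhere, which would remove the `sys.path` workaround flagged as a "dirty fix" in `app/app.py` and the scratchpad notebook.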
ragbot.egg-info/PKG-INFO ADDED
@@ -0,0 +1,3 @@
+ Metadata-Version: 2.4
+ Name: ragbot
+ Version: 0.1.0
ragbot.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,15 @@
+ README.md
+ pyproject.toml
+ llm/__init__.py
+ llm/answer_generator.py
+ llm/base_llm.py
+ llm/query_refiner.py
+ ragbot.egg-info/PKG-INFO
+ ragbot.egg-info/SOURCES.txt
+ ragbot.egg-info/dependency_links.txt
+ ragbot.egg-info/top_level.txt
+ src/__init__.py
+ src/embedding.py
+ src/pipeline.py
+ src/preprocess.py
+ src/utils.py
ragbot.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
+
ragbot.egg-info/top_level.txt ADDED
@@ -0,0 +1,2 @@
+ llm
+ src
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (139 Bytes).

src/__pycache__/embedding.cpython-310.pyc ADDED
Binary file (1.75 kB).

src/__pycache__/pipeline.cpython-310.pyc ADDED
Binary file (4.75 kB).

src/__pycache__/preprocess.cpython-310.pyc ADDED
Binary file (1.26 kB).

src/__pycache__/utils.cpython-310.pyc ADDED
Binary file (1.85 kB).

src/embedding.py ADDED
@@ -0,0 +1,40 @@
+ from typing import Union, List
+ from langchain.embeddings import HuggingFaceEmbeddings
+ 
+ class EmbeddingModel:
+     """
+     A flexible embedding model wrapper supporting multiple backend models.
+     """
+ 
+     def __init__(self, model_type: str = "huggingface", model_name: str = "all-MiniLM-L6-v2"):
+         self.model_type = model_type
+         self.model_name = model_name
+         self.model = self._load_model()
+ 
+     def _load_model(self):
+         if self.model_type == "huggingface":
+             return HuggingFaceEmbeddings(model_name=self.model_name)
+ 
+         # Implementation for other model types can be added here
+ 
+         else:
+             raise ValueError(f"Unsupported model type: {self.model_type}")
+ 
+     def embed(self, text: Union[str, List[str]]):
+         """
+         Generate embeddings for the given text.
+ 
+         :param text: A string or list of strings.
+         :return: A list of embeddings.
+         """
+         if self.model_type == "huggingface":
+ 
+             if isinstance(text, list):
+                 return [self.model.embed_query(t) for t in text]
+             return self.model.embed_query(text)
+ 
+         elif self.model_type == "sentence_transformers":
+             return self.model.encode(text, convert_to_tensor=True).tolist()
+ 
+         else:
+             raise NotImplementedError(f"Embedding for {self.model_type} is not implemented.")
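
A minimal usage sketch for the `EmbeddingModel` wrapper above; the sample strings are illustrative:

```python
from src.embedding import EmbeddingModel

emb = EmbeddingModel(model_type="huggingface", model_name="all-MiniLM-L6-v2")
single = emb.embed("joint mutual information")      # one vector
batch = emb.embed(["first chunk", "second chunk"])  # list of two vectors
print(len(single), len(batch))  # -> 384 2 (all-MiniLM-L6-v2 yields 384-dim vectors)
```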
src/pipeline.py ADDED
@@ -0,0 +1,153 @@
+ from src.utils import load_config, get_pdf_from_url, extract_text_from_pdf
+ from src.preprocess import Preprocessor
+ from src.embedding import EmbeddingModel
+ from dotenv import load_dotenv
+ from langchain.vectorstores import Chroma
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from llm.answer_generator import GroqAnswerGenerator
+ from llm.query_refiner import QueryRefiner
+ 
+ load_dotenv()
+ 
+ 
+ class ChatPipeline:
+     def __init__(self, arxiv_id: str = None):
+ 
+         self.arxiv_id = arxiv_id
+         self.config = load_config()
+         self.chatbot_config = load_config("./configs/llm_producer.yaml")
+         self.chunks = None
+         self.retriever = None
+         self.chatbot = None
+ 
+     def _preprocess_docs(self, docs):
+         """
+         Preprocess the input documents using the Preprocessor class.
+ 
+         Args:
+             docs (list): The documents to preprocess.
+ 
+         Returns:
+             list: The preprocessed documents.
+         """
+         if not docs:
+             raise ValueError("No documents provided for preprocessing.")
+         if not isinstance(docs, list):
+             raise TypeError("Expected a list of documents for preprocessing.")
+         if not all(hasattr(doc, 'page_content') for doc in docs):
+             raise ValueError("All documents must have a 'page_content' attribute.")
+ 
+         preprocessor = Preprocessor()
+ 
+         for doc in docs:
+             doc.page_content = preprocessor(doc.page_content)
+         return docs
+ 
+     def _create_chunks(self, docs):
+         """
+         Create chunks from the preprocessed documents.
+ 
+         Args:
+             docs (list): List of preprocessed documents.
+ 
+         Returns:
+             list: List of document chunks.
+         """
+ 
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=self.config["text_splitter"]["chunk_size"],
+             chunk_overlap=self.config["text_splitter"]["chunk_overlap"]
+         )
+ 
+         return text_splitter.split_documents(docs)
+ 
+     def _create_vector_store(self, chunks):
+         """
+         Create a vector store from the document chunks.
+ 
+         Args:
+             chunks (list): List of document chunks.
+ 
+         Returns:
+             VectorStore: The created vector store.
+         """
+         embedding_model = EmbeddingModel(model_type=self.config['embedding']['model_type'],
+                                          model_name=self.config['embedding']['model_name'])
+         vector_store = Chroma.from_documents(
+             documents=chunks,
+             embedding=embedding_model.model,
+             persist_directory=self.config['vector_db']['path']
+         )
+         vector_store.persist()
+         self.retriever = vector_store.as_retriever(search_kwargs=self.config['vector_db']['search_kwargs'])
+ 
+     def setup(self, arxiv_id: str):
+         """
+         Set up the pipeline by loading necessary configurations and resources.
+         """
+         self.arxiv_id = arxiv_id
+         if not self.arxiv_id:
+             raise ValueError("arxiv_id must be provided to set up the pipeline.")
+ 
+         self.query_refiner = QueryRefiner()
+ 
+         get_pdf_from_url(self.arxiv_id, self.config['storage']['save_pdf_path'])
+ 
+         documents = extract_text_from_pdf(f"{self.config['storage']['save_pdf_path']}/{self.arxiv_id}.pdf")
+ 
+         preprocessed_docs = self._preprocess_docs(documents)
+ 
+         self.chunks = self._create_chunks(preprocessed_docs)
+ 
+         self._create_vector_store(self.chunks)
+ 
+         self.chatbot = GroqAnswerGenerator(
+             model_name=self.chatbot_config['model_name'],
+             temperature=self.chatbot_config['temperature'],
+             max_tokens=self.chatbot_config['max_tokens'],
+             retriever=self.retriever
+         )
+ 
+     def setup_from_pdf(self, pdf_path: str):
+         """
+         Set up the pipeline using a local PDF file.
+         """
+         if not pdf_path:
+             raise ValueError("pdf_path must be provided to set up the pipeline.")
+ 
+         self.query_refiner = QueryRefiner()
+ 
+         documents = extract_text_from_pdf(pdf_path)
+ 
+         preprocessed_docs = self._preprocess_docs(documents)
+ 
+         self.chunks = self._create_chunks(preprocessed_docs)
+ 
+         self._create_vector_store(self.chunks)
+ 
+         self.chatbot = GroqAnswerGenerator(
+             model_name=self.chatbot_config['model_name'],
+             temperature=self.chatbot_config['temperature'],
+             max_tokens=self.chatbot_config['max_tokens'],
+             retriever=self.retriever
+         )
+ 
+     def query(self, prompt: str, refine_query: bool = True):
+         """
+         Query the chatbot with a prompt.
+ 
+         Args:
+             prompt (str): The prompt to query the chatbot with.
+             refine_query (bool): Whether to refine the prompt before answering.
+ 
+         Returns:
+             str: The response from the chatbot.
+         """
+         if not self.chatbot:
+             raise ValueError("Chatbot is not initialized. Call setup() method first.")
+ 
+         if refine_query:
+             refined_query = self.query_refiner.refine(prompt)
+             return self.chatbot.generate_answer(refined_query)
+         else:
+             return self.chatbot.generate_answer(prompt)
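
End-to-end usage of `ChatPipeline`, mirroring `Notebooks/scratchpad.ipynb`; it assumes a `GROQ_API_KEY` in `.env` and network access to arXiv:

```python
from src.pipeline import ChatPipeline

cp = ChatPipeline()
cp.setup(arxiv_id="2407.05040")  # download, preprocess, chunk, embed, and index the paper
print(cp.query("How does Evol-Instruct differ from Self-Instruct?"))
```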
src/preprocess.py ADDED
@@ -0,0 +1,44 @@
+ import re
+ 
+ class Preprocessor:
+     """
+     A class for preprocessing text data.
+     This class provides methods to clean and normalize text data.
+     """
+ 
+     @staticmethod
+     def basic_preprocess(text):
+         """
+         Basic preprocessing of text data.
+         - Removes common boilerplate (page numbers, arXiv mentions, copyright lines)
+         - Merges single line breaks within paragraphs
+         - Repairs hyphenation at line breaks
+         - Normalizes whitespace and strips leading/trailing whitespace
+         """
+         # Remove common strings like page numbers, arXiv mentions, etc.
+         text = re.sub(r'Page \d+|arXiv preprint.*|Copyright.*', '', text, flags=re.IGNORECASE)
+ 
+         # Merge single newlines within paragraphs, but keep double newlines as paragraph breaks
+         text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
+ 
+         # Remove hyphenations at line breaks (like "exam-\nple" -> "example")
+         text = re.sub(r'-\s*\n', '', text)
+ 
+         # The merge above may have already turned "-\n" into "- ", so drop the hyphen plus trailing whitespace too
+         text = re.sub(r'-\s+', '', text)
+ 
+         # Normalize extra spaces
+         text = re.sub(r'\s+', ' ', text)
+ 
+         # Strip leading/trailing whitespace
+         text = text.strip()
+ 
+         return text
+ 
+     def __call__(self, *args, **kwds):
+         """
+         Call method to apply basic preprocessing.
+         This allows the class instance to be used as a function.
+         """
+         return self.basic_preprocess(*args, **kwds)
+ 
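
A quick check of `Preprocessor` on the hyphenation case mentioned in the comments above; the input text is an illustrative sample:

```python
from src.preprocess import Preprocessor

pre = Preprocessor()
raw = "This is an exam-\nple sentence.\nPage 3\nSplit across\nlines."
print(pre(raw))  # -> "This is an example sentence. Split across lines."
```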
src/utils.py ADDED
@@ -0,0 +1,63 @@
+ import os
+ import yaml
+ import requests
+ from pathlib import Path
+ from langchain.document_loaders import PyPDFLoader
+ 
+ def get_pdf_from_url(arxiv_id: str, save_dir: str) -> str:
+     """
+     Downloads a PDF from arXiv given an ID, unless already downloaded.
+ 
+     Returns:
+         str: Path to the downloaded (or existing) PDF.
+     """
+     os.makedirs(save_dir, exist_ok=True)
+     pdf_path = os.path.join(save_dir, f"{arxiv_id}.pdf")
+ 
+     if os.path.exists(pdf_path):
+         # print(f"[cache] PDF already exists: {pdf_path}")
+         return pdf_path
+ 
+     url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
+ 
+     response = requests.get(url)
+     response.raise_for_status()
+ 
+     with open(pdf_path, "wb") as f:
+         f.write(response.content)
+ 
+     return pdf_path
+ 
+ def load_config(config_path: str = "./configs/pipeline.yaml") -> dict:
+     """
+     Load a YAML configuration file and return its contents as a dictionary.
+ 
+     Args:
+         config_path (str): The path to the YAML configuration file.
+ 
+     Returns:
+         dict: The contents of the configuration file.
+     """
+     config_path = Path(config_path)
+     if not config_path.exists():
+         raise FileNotFoundError(f"Configuration file {config_path} does not exist.")
+ 
+     with open(config_path, 'r') as file:
+         config = yaml.safe_load(file)
+ 
+     return config
+ 
+ def extract_text_from_pdf(pdf_path: str) -> list:
+     """
+     Extract text from a PDF file.
+ 
+     Args:
+         pdf_path (str): The path to the PDF file.
+ 
+     Returns:
+         list: The extracted documents, one per PDF page.
+     """
+     loader = PyPDFLoader(pdf_path)
+     documents = loader.load()
+ 
+     return documents