vinny4 committed on
Commit 9c37331 · 0 Parent(s):

initial commit
.gitattributes ADDED
@@ -0,0 +1,3 @@
+ *.pdf filter=lfs diff=lfs merge=lfs -text
+ *.sqlite3 filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .env
+ data/
Notebooks/scratchpad.ipynb ADDED
@@ -0,0 +1,141 @@
+ {
+  "cells": [
+   {
+    "cell_type": "markdown",
+    "id": "04cabe4c",
+    "metadata": {},
+    "source": [
+     "Uncomment and run if dependencies are not installed"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "id": "cc4d2b9b",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# !pip install -q pyyaml\n",
+     "# !pip install -q requests\n",
+     "# !pip install -q python-dotenv\n",
+     "# !pip install -qU langchain-community\n",
+     "# !pip install -q pypdf\n",
+     "# %pip install -qU langchain-groq\n",
+     "# !pip install -q chromadb\n",
+     "# !pip install -q sentence-transformers"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "id": "7cdfaebc",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import sys\n",
+     "import os\n",
+     "\n",
+     "project_root = os.path.abspath(\"..\")  # adjust this depending on where your notebook lives\n",
+     "if project_root not in sys.path:\n",
+     "    sys.path.insert(0, project_root)\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "id": "72e187e0",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from src.pipeline import ChatPipeline"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 3,
+    "id": "f79416f1",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from src.utils import load_config"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 4,
+    "id": "ba557b13",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "cp = ChatPipeline()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 5,
+    "id": "49dc2580",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "d:\\Thesis\\Vinayak Rana\\LLM\\RAG\\src\\embedding.py:16: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFaceEmbeddings``.\n",
+       "  return HuggingFaceEmbeddings(model_name=self.model_name)\n",
+       "c:\\Users\\vinny\\Miniconda3\\envs\\scholarchatbot\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+       "  from .autonotebook import tqdm as notebook_tqdm\n",
+       "d:\\Thesis\\Vinayak Rana\\LLM\\RAG\\src\\pipeline.py:79: LangChainDeprecationWarning: Since Chroma 0.4.x the manual persistence method is no longer supported as docs are automatically persisted.\n",
+       "  vector_store.persist()\n",
+       "d:\\Thesis\\Vinayak Rana\\LLM\\RAG\\llm\\answer_generator.py:23: LangChainDeprecationWarning: Please see the migration guide at: https://python.langchain.com/docs/versions/migrating_memory/\n",
+       "  self.memory = ConversationBufferWindowMemory(\n"
+      ]
+     }
+    ],
+    "source": [
+     "cp.setup(arxiv_id=\"2407.05040\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 7,
+    "id": "ca77354b",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "'Based on the provided context, here\\'s a differentiation between Self-Instruct, Evol-Instruct, and OSSInstruct:\\n\\n1. **Self-Instruct**: This technique is used to align language models with self-generated instructions. It involves generating instruction-following data points through the Self-Instruct technique, which is utilized in Codealpaca and CodeLlama. The Self-Instruct technique is described in the paper \"Self-instruct: Aligning language models with self-generated instructions\" by Yizhong Wang et al. (2022).\\n\\n2. **Evol-Instruct**: This technique is used to evolve instruction-following data in both depth and breadth dimensions. It is employed in Wizardcoder to further evolve the Codealpaca dataset. The Evol-Instruct method is described in the paper \"EvolInstruct\" by Can Xu et al. (2023a).\\n\\n3. **OSSInstruct**: This technique is used to create instruction-following data from unlabeled open-source code snippets. It is employed in Magicoder to construct a method. The OSSInstruct technique is not described in detail in the provided context, but it is mentioned as a distinct method used in Magicoder.\\n\\nIn summary, Self-Instruct generates instruction-following data points, Evol-Instruct evolves instruction-following data, and OSSInstruct creates instruction-following data from open-source code snippets.'"
+       ]
+      },
+      "execution_count": 7,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "cp.query(\"can you differentiate between self instruct , evol instruct and OSS ?\")"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "scholarchatbot",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.18"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
Notebooks/tutorials/RAG_basic.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Notebooks/tutorials/ScholarBot.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
README.md ADDED
@@ -0,0 +1,7 @@
+ # RAG
+ 
+ ## What should be the input format?
+ * Extract text from the PDF?
+ * Use the .tex file from the submission?
+ ##### The LaTeX format might not work very well for the LLM (especially when it is a smaller LLM), so extracting text from the PDF is the better option.
+ We could bring in LaTeX if the output doesn't seem good enough, or if we are missing the mathematical equations.
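
A minimal sketch of the PDF-extraction option discussed in the README, using the same `PyPDFLoader` that `src/utils.py` relies on; the file path is an illustrative placeholder:

```python
from langchain.document_loaders import PyPDFLoader  # same loader used in src/utils.py

# Hypothetical local path; any downloaded arXiv PDF works here.
loader = PyPDFLoader("data/pdfs/2407.05040.pdf")
documents = loader.load()  # one Document per page
print(documents[0].page_content[:200])  # peek at the first page's text
```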
app/app.py ADDED
@@ -0,0 +1,62 @@
+ import os
+ import tempfile
+ import streamlit as st
+ import sys
+ from dotenv import load_dotenv
+ load_dotenv()
+ 
+ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))  # dirty fix ----> can fix this with pip install -e .
+ if project_root not in sys.path:
+     sys.path.insert(0, project_root)
+ 
+ from src.pipeline import ChatPipeline
+ 
+ st.set_page_config(page_title="ScholarBot", layout="wide")
+ st.title("ScholarBot: Chat with Research Papers")
+ 
+ if "chat_pipeline" not in st.session_state:
+     st.session_state.chat_pipeline = None
+ if "chat_history" not in st.session_state:
+     st.session_state.chat_history = []
+ 
+ st.sidebar.header("Input Paper")
+ input_method = st.sidebar.radio("Choose input method:", ("Upload PDF", "arXiv ID"))
+ 
+ refine_query = st.sidebar.checkbox("Refine query before answering?", value=True)
+ 
+ if input_method == "Upload PDF":
+     uploaded_file = st.sidebar.file_uploader("Upload a PDF file", type=["pdf"])
+     if uploaded_file is not None:
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+             tmp_file.write(uploaded_file.read())
+             pdf_path = tmp_file.name
+ 
+         st.info("Setting up ScholarBot...")
+         st.session_state.chat_pipeline = ChatPipeline()
+         st.session_state.chat_pipeline.setup_from_pdf(pdf_path)
+         st.success("PDF loaded and indexed successfully!")
+ else:
+     arxiv_id = st.sidebar.text_input("Enter arXiv ID:")
+     if st.sidebar.button("Load Paper") and arxiv_id:
+         st.info("Setting up ScholarBot...")
+         st.session_state.chat_pipeline = ChatPipeline()
+         st.session_state.chat_pipeline.setup(arxiv_id=arxiv_id)
+         st.success(f"arXiv paper {arxiv_id} loaded successfully!")
+ 
+ st.subheader("Chat with the Paper")
+ user_input = st.text_input("Ask a question:", placeholder="e.g. What is the JointMI acquisition function?")
+ 
+ if st.button("Generate Answer") and user_input:
+     if st.session_state.chat_pipeline:
+         answer = st.session_state.chat_pipeline.query(user_input, refine_query=refine_query)
+         st.session_state.chat_history.append((user_input, answer))
+     else:
+         st.warning("Please load a paper first.")
+ 
+ if st.session_state.chat_history:
+     st.markdown("---")
+     st.subheader("📜 Chat History")
+     for q, a in st.session_state.chat_history[::-1]:
+         st.markdown(f"**You:** {q}")
+         st.markdown(f"**ScholarBot:** {a}")
+         st.markdown("---")
configs/llm_producer.yaml ADDED
@@ -0,0 +1,13 @@
+ model_name: llama-3.1-8b-instant
+ temperature: 0.2
+ max_tokens: 512
+ memory_window: 3
+ prompt_template: |
+   You are a helpful research assistant. Use the context below to answer the question.
+   If the answer is not in the context, say "I don't know."
+ 
+   Context:
+   {context}
+ 
+   Question:
+   {question}
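
For reference, a small sketch of how `llm/answer_generator.py` consumes this template via `PromptTemplate`; the context and question values below are illustrative placeholders:

```python
from langchain.prompts import PromptTemplate
from src.utils import load_config

config = load_config("./configs/llm_producer.yaml")
prompt = PromptTemplate.from_template(config["prompt_template"])
# The chain fills {context} with retrieved chunks and {question} with the user query.
print(prompt.format(context="(retrieved chunks)", question="What is JointMI?"))
```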
configs/llm_refiner.yaml ADDED
@@ -0,0 +1,6 @@
+ model_name: llama-3.1-8b-instant
+ temperature: 0.3
+ max_tokens: 100
+ system_prompt: |
+   You are a query refining assistant. Improve the user's question to be more specific, clear, and relevant for a technical document search.
+   Preserve the original meaning. Avoid adding new facts. Use formal language if needed.
configs/pipeline.yaml ADDED
@@ -0,0 +1,17 @@
+ storage:
+   save_pdf_path: ./data/pdfs
+   persist_vector_db: True
+   vector_db_path: ./data/vector_db
+ 
+ text_splitter:
+   chunk_size: 1000
+   chunk_overlap: 200
+ 
+ embedding:
+   model_name: all-MiniLM-L6-v2
+   model_type: huggingface
+ 
+ vector_db:
+   path: "./data/vector_db/chroma_db"
+   search_kwargs:
+     "k": 3
llm/__init__.py ADDED
File without changes
llm/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (139 Bytes).

llm/__pycache__/answer_generator.cpython-310.pyc ADDED
Binary file (1.5 kB).

llm/__pycache__/base_llm.cpython-310.pyc ADDED
Binary file (666 Bytes).

llm/__pycache__/query_refiner.cpython-310.pyc ADDED
Binary file (1.03 kB).

llm/answer_generator.py ADDED
@@ -0,0 +1,38 @@
+ from llm.base_llm import BaseLLM
+ from src.utils import load_config
+ from langchain_groq import ChatGroq
+ from langchain.prompts import PromptTemplate
+ from langchain.memory import ConversationBufferWindowMemory
+ from langchain.chains import ConversationalRetrievalChain
+ 
+ class GroqAnswerGenerator(BaseLLM):
+     def __init__(self, model_name: str, temperature: float, max_tokens: int, retriever=None):
+ 
+         self.retriever = retriever
+         self.config = load_config("./configs/llm_producer.yaml")
+         self.model = ChatGroq(
+             model=model_name,
+             temperature=temperature,
+             max_tokens=max_tokens
+         )
+ 
+         self.prompt_template = PromptTemplate.from_template(
+             self.config["prompt_template"]
+         )
+ 
+         self.memory = ConversationBufferWindowMemory(
+             memory_key="chat_history",  # required by ConversationalRetrievalChain
+             return_messages=True,
+             k=self.config["memory_window"],
+         )
+         self.qa_chain = ConversationalRetrievalChain.from_llm(
+             llm=self.model,
+             retriever=self.retriever,
+             memory=self.memory,
+             chain_type="stuff",
+             combine_docs_chain_kwargs={
+                 "prompt": self.prompt_template}
+         )
+ 
+     def generate_answer(self, prompt: str):
+         return self.qa_chain.run(question=prompt)
llm/base_llm.py ADDED
@@ -0,0 +1,12 @@
+ from abc import ABC, abstractmethod
+ 
+ class BaseLLM(ABC):
+     """
+     Base class for all LLMs (Large Language Models).
+     """
+     @abstractmethod
+     def generate_answer(self, question: str):
+         """
+         Generate an answer to the question; must be implemented by subclasses.
+         """
+         pass
llm/query_refiner.py ADDED
@@ -0,0 +1,20 @@
+ from langchain_groq import ChatGroq
+ from src.utils import load_config
+ from langchain.prompts import ChatPromptTemplate
+ 
+ class QueryRefiner:
+     def __init__(self):
+         config = load_config("./configs/llm_refiner.yaml")
+         self.model = ChatGroq(
+             model=config["model_name"],
+             temperature=config["temperature"],
+             max_tokens=config["max_tokens"]
+         )
+         self.prompt = ChatPromptTemplate.from_messages([
+             ("system", config["system_prompt"]),
+             ("human", "{query}")
+         ])
+ 
+     def refine(self, query: str):
+         chain = self.prompt | self.model
+         return chain.invoke({"query": query}).content
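
A minimal standalone use of `QueryRefiner`, assuming a `GROQ_API_KEY` is available in the environment; the sample query is illustrative:

```python
from llm.query_refiner import QueryRefiner

refiner = QueryRefiner()
# Returns a sharpened version of the question as a plain string.
print(refiner.refine("what is jointmi?"))
```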
pyproject.toml ADDED
@@ -0,0 +1,7 @@
+ [project]
+ name = "ragbot"
+ version = "0.1.0"
+ dependencies = []
+ 
+ [tool.setuptools]
+ packages = ["src", "llm"]
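
With this packaging configuration in place, running `pip install -e .` from the repository root makes `src` and `llm` importable from anywhere, which would remove the `sys.path` workaround flagged as a "dirty fix" in `app/app.py` and the scratchpad notebook.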
ragbot.egg-info/PKG-INFO ADDED
@@ -0,0 +1,3 @@
+ Metadata-Version: 2.4
+ Name: ragbot
+ Version: 0.1.0
ragbot.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,15 @@
+ README.md
+ pyproject.toml
+ llm/__init__.py
+ llm/answer_generator.py
+ llm/base_llm.py
+ llm/query_refiner.py
+ ragbot.egg-info/PKG-INFO
+ ragbot.egg-info/SOURCES.txt
+ ragbot.egg-info/dependency_links.txt
+ ragbot.egg-info/top_level.txt
+ src/__init__.py
+ src/embedding.py
+ src/pipeline.py
+ src/preprocess.py
+ src/utils.py
ragbot.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
+
ragbot.egg-info/top_level.txt ADDED
@@ -0,0 +1,2 @@
+ llm
+ src
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (139 Bytes).

src/__pycache__/embedding.cpython-310.pyc ADDED
Binary file (1.75 kB).

src/__pycache__/pipeline.cpython-310.pyc ADDED
Binary file (4.75 kB).

src/__pycache__/preprocess.cpython-310.pyc ADDED
Binary file (1.26 kB).

src/__pycache__/utils.cpython-310.pyc ADDED
Binary file (1.85 kB).

src/embedding.py ADDED
@@ -0,0 +1,40 @@
+ from typing import Union, List
+ from langchain.embeddings import HuggingFaceEmbeddings
+ 
+ class EmbeddingModel:
+     """
+     A flexible embedding model wrapper supporting multiple backend models.
+     """
+ 
+     def __init__(self, model_type: str = "huggingface", model_name: str = "all-MiniLM-L6-v2"):
+         self.model_type = model_type
+         self.model_name = model_name
+         self.model = self._load_model()
+ 
+     def _load_model(self):
+         if self.model_type == "huggingface":
+             return HuggingFaceEmbeddings(model_name=self.model_name)
+ 
+         # Implementation for other model types can be added here
+ 
+         else:
+             raise ValueError(f"Unsupported model type: {self.model_type}")
+ 
+     def embed(self, text: Union[str, List[str]]):
+         """
+         Generate embeddings for the given text.
+ 
+         :param text: A string or list of strings.
+         :return: A list of embeddings.
+         """
+         if self.model_type == "huggingface":
+ 
+             if isinstance(text, list):
+                 return [self.model.embed_query(t) for t in text]
+             return self.model.embed_query(text)
+ 
+         elif self.model_type == "sentence_transformers":
+             return self.model.encode(text, convert_to_tensor=True).tolist()
+ 
+         else:
+             raise NotImplementedError(f"Embedding for {self.model_type} is not implemented.")
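
A minimal usage sketch for the `EmbeddingModel` wrapper above; the sample strings are illustrative:

```python
from src.embedding import EmbeddingModel

emb = EmbeddingModel(model_type="huggingface", model_name="all-MiniLM-L6-v2")
single = emb.embed("joint mutual information")      # one vector
batch = emb.embed(["first chunk", "second chunk"])  # list of two vectors
print(len(single), len(batch))  # -> 384 2 (all-MiniLM-L6-v2 yields 384-dim vectors)
```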
src/pipeline.py ADDED
@@ -0,0 +1,153 @@
+ from src.utils import load_config, get_pdf_from_url, extract_text_from_pdf
+ from src.preprocess import Preprocessor
+ from src.embedding import EmbeddingModel
+ from dotenv import load_dotenv
+ from langchain.vectorstores import Chroma
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from llm.answer_generator import GroqAnswerGenerator
+ from llm.query_refiner import QueryRefiner
+ 
+ load_dotenv()
+ 
+ 
+ class ChatPipeline:
+     def __init__(self, arxiv_id: str = None):
+ 
+         self.arxiv_id = arxiv_id
+         self.config = load_config()
+         self.chatbot_config = load_config("./configs/llm_producer.yaml")
+         self.chunks = None
+         self.retriever = None
+         self.chatbot = None
+ 
+     def _preprocess_docs(self, docs):
+         """
+         Preprocess the input documents using the Preprocessor class.
+ 
+         Args:
+             docs (list): The documents to preprocess.
+ 
+         Returns:
+             list: The preprocessed documents.
+         """
+         if not docs:
+             raise ValueError("No documents provided for preprocessing.")
+         if not isinstance(docs, list):
+             raise TypeError("Expected a list of documents for preprocessing.")
+         if not all(hasattr(doc, 'page_content') for doc in docs):
+             raise ValueError("All documents must have a 'page_content' attribute.")
+ 
+         preprocessor = Preprocessor()
+ 
+         for doc in docs:
+             doc.page_content = preprocessor(doc.page_content)
+         return docs
+ 
+     def _create_chunks(self, docs):
+         """
+         Create chunks from the preprocessed documents.
+ 
+         Args:
+             docs (list): List of preprocessed documents.
+ 
+         Returns:
+             list: List of document chunks.
+         """
+ 
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=self.config["text_splitter"]["chunk_size"],
+             chunk_overlap=self.config["text_splitter"]["chunk_overlap"]
+         )
+ 
+         return text_splitter.split_documents(docs)
+ 
+     def _create_vector_store(self, chunks):
+         """
+         Create a vector store from the document chunks.
+ 
+         Args:
+             chunks (list): List of document chunks.
+ 
+         Returns:
+             VectorStore: The created vector store.
+         """
+         embedding_model = EmbeddingModel(model_type=self.config['embedding']['model_type'],
+                                          model_name=self.config['embedding']['model_name'])
+         vector_store = Chroma.from_documents(
+             documents=chunks,
+             embedding=embedding_model.model,
+             persist_directory=self.config['vector_db']['path']
+         )
+         vector_store.persist()
+         self.retriever = vector_store.as_retriever(search_kwargs=self.config['vector_db']['search_kwargs'])
+ 
+     def setup(self, arxiv_id: str):
+         """
+         Set up the pipeline by loading necessary configurations and resources.
+         """
+         self.arxiv_id = arxiv_id
+         if not self.arxiv_id:
+             raise ValueError("arxiv_id must be provided to set up the pipeline.")
+ 
+         self.query_refiner = QueryRefiner()
+ 
+         get_pdf_from_url(self.arxiv_id, self.config['storage']['save_pdf_path'])
+ 
+         documents = extract_text_from_pdf(f"{self.config['storage']['save_pdf_path']}/{self.arxiv_id}.pdf")
+ 
+         preprocessed_docs = self._preprocess_docs(documents)
+ 
+         self.chunks = self._create_chunks(preprocessed_docs)
+ 
+         self._create_vector_store(self.chunks)
+ 
+         self.chatbot = GroqAnswerGenerator(
+             model_name=self.chatbot_config['model_name'],
+             temperature=self.chatbot_config['temperature'],
+             max_tokens=self.chatbot_config['max_tokens'],
+             retriever=self.retriever
+         )
+ 
+     def setup_from_pdf(self, pdf_path: str):
+         """
+         Set up the pipeline using a local PDF file.
+         """
+         if not pdf_path:
+             raise ValueError("pdf_path must be provided to set up the pipeline.")
+ 
+         self.query_refiner = QueryRefiner()
+ 
+         documents = extract_text_from_pdf(pdf_path)
+ 
+         preprocessed_docs = self._preprocess_docs(documents)
+ 
+         self.chunks = self._create_chunks(preprocessed_docs)
+ 
+         self._create_vector_store(self.chunks)
+ 
+         self.chatbot = GroqAnswerGenerator(
+             model_name=self.chatbot_config['model_name'],
+             temperature=self.chatbot_config['temperature'],
+             max_tokens=self.chatbot_config['max_tokens'],
+             retriever=self.retriever
+         )
+ 
+     def query(self, prompt: str, refine_query: bool = True):
+         """
+         Query the chatbot with a prompt.
+ 
+         Args:
+             prompt (str): The prompt to query the chatbot with.
+             refine_query (bool): Whether to refine the prompt before answering.
+ 
+         Returns:
+             str: The response from the chatbot.
+         """
+         if not self.chatbot:
+             raise ValueError("Chatbot is not initialized. Call setup() method first.")
+ 
+         if refine_query:
+             refined_query = self.query_refiner.refine(prompt)
+             return self.chatbot.generate_answer(refined_query)
+         else:
+             return self.chatbot.generate_answer(prompt)
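
End-to-end usage of `ChatPipeline`, mirroring `Notebooks/scratchpad.ipynb`; it assumes a `GROQ_API_KEY` in `.env` and network access to arXiv:

```python
from src.pipeline import ChatPipeline

cp = ChatPipeline()
cp.setup(arxiv_id="2407.05040")  # download, preprocess, chunk, embed, and index the paper
print(cp.query("How does Evol-Instruct differ from Self-Instruct?"))
```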
src/preprocess.py ADDED
@@ -0,0 +1,44 @@
+ import re
+ 
+ class Preprocessor:
+     """
+     A class for preprocessing text data.
+     This class provides methods to clean and normalize text data.
+     """
+ 
+     @staticmethod
+     def basic_preprocess(text):
+         """
+         Basic preprocessing of text data.
+         - Removes common boilerplate (page numbers, arXiv mentions, copyright lines)
+         - Merges single line breaks within paragraphs
+         - Repairs hyphenation at line breaks
+         - Normalizes whitespace and strips leading/trailing whitespace
+         """
+         # Remove common strings like page numbers, arXiv mentions, etc.
+         text = re.sub(r'Page \d+|arXiv preprint.*|Copyright.*', '', text, flags=re.IGNORECASE)
+ 
+         # Merge single newlines within paragraphs, but keep double newlines as paragraph breaks
+         text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
+ 
+         # Remove hyphenations at line breaks (like "exam-\nple" -> "example")
+         text = re.sub(r'-\s*\n', '', text)
+ 
+         # The merge above may have already turned "-\n" into "- ", so drop the hyphen plus trailing whitespace too
+         text = re.sub(r'-\s+', '', text)
+ 
+         # Normalize extra spaces
+         text = re.sub(r'\s+', ' ', text)
+ 
+         # Strip leading/trailing whitespace
+         text = text.strip()
+ 
+         return text
+ 
+     def __call__(self, *args, **kwds):
+         """
+         Call method to apply basic preprocessing.
+         This allows the class instance to be used as a function.
+         """
+         return self.basic_preprocess(*args, **kwds)
+ 
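
A quick check of `Preprocessor` on the hyphenation case mentioned in the comments above; the input text is an illustrative sample:

```python
from src.preprocess import Preprocessor

pre = Preprocessor()
raw = "This is an exam-\nple sentence.\nPage 3\nSplit across\nlines."
print(pre(raw))  # -> "This is an example sentence. Split across lines."
```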
src/utils.py ADDED
@@ -0,0 +1,63 @@
+ import os
+ import yaml
+ import requests
+ from pathlib import Path
+ from langchain.document_loaders import PyPDFLoader
+ 
+ def get_pdf_from_url(arxiv_id: str, save_dir: str) -> str:
+     """
+     Downloads a PDF from arXiv given an ID, unless already downloaded.
+ 
+     Returns:
+         str: Path to the downloaded (or existing) PDF.
+     """
+     os.makedirs(save_dir, exist_ok=True)
+     pdf_path = os.path.join(save_dir, f"{arxiv_id}.pdf")
+ 
+     if os.path.exists(pdf_path):
+         # print(f"[cache] PDF already exists: {pdf_path}")
+         return pdf_path
+ 
+     url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
+ 
+     response = requests.get(url)
+     response.raise_for_status()
+ 
+     with open(pdf_path, "wb") as f:
+         f.write(response.content)
+ 
+     return pdf_path
+ 
+ def load_config(config_path: str = "./configs/pipeline.yaml") -> dict:
+     """
+     Load a YAML configuration file and return its contents as a dictionary.
+ 
+     Args:
+         config_path (str): The path to the YAML configuration file.
+ 
+     Returns:
+         dict: The contents of the configuration file.
+     """
+     config_path = Path(config_path)
+     if not config_path.exists():
+         raise FileNotFoundError(f"Configuration file {config_path} does not exist.")
+ 
+     with open(config_path, 'r') as file:
+         config = yaml.safe_load(file)
+ 
+     return config
+ 
+ def extract_text_from_pdf(pdf_path: str) -> list:
+     """
+     Extract text from a PDF file.
+ 
+     Args:
+         pdf_path (str): The path to the PDF file.
+ 
+     Returns:
+         list: The extracted documents, one per PDF page.
+     """
+     loader = PyPDFLoader(pdf_path)
+     documents = loader.load()
+ 
+     return documents