Spaces:

mintaeng
/

fut_gradio

Sleeping

App Files Files Community

mintaeng commited on Jun 24, 2024

Commit

e6db427

verified ·

1 Parent(s): e99b871

Create pdfchatbot.py

Browse files

Files changed (1) hide show

pdfchatbot.py +193 -0

pdfchatbot.py ADDED Viewed

	@@ -0,0 +1,193 @@

+import yaml
+import fitz
+import torch
+import gradio as gr
+from PIL import Image
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores import Chroma
+from langchain.llms import HuggingFacePipeline
+from langchain.chains import ConversationalRetrievalChain
+from langchain.document_loaders import PyPDFLoader
+from langchain.prompts import PromptTemplate
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+class PDFChatBot:
+    def __init__(self, config_path="../config.yaml"):
+        """
+        Initialize the PDFChatBot instance.
+        Parameters:
+            config_path (str): Path to the configuration file (default is "../config.yaml").
+        """
+        self.processed = False
+        self.page = 0
+        self.chat_history = []
+        self.config = self.load_config(config_path)
+        # Initialize other attributes to None
+        self.prompt = None
+        self.documents = None
+        self.embeddings = None
+        self.vectordb = None
+        self.tokenizer = None
+        self.model = None
+        self.pipeline = None
+        self.chain = None
+    def load_config(self, file_path):
+        """
+        Load configuration from a YAML file.
+        Parameters:
+            file_path (str): Path to the YAML configuration file.
+        Returns:
+            dict: Configuration as a dictionary.
+        """
+        with open(file_path, 'r') as stream:
+            try:
+                config = yaml.safe_load(stream)
+                return config
+            except yaml.YAMLError as exc:
+                print(f"Error loading configuration: {exc}")
+                return None
+    def add_text(self, history, text):
+        """
+        Add user-entered text to the chat history.
+        Parameters:
+            history (list): List of chat history tuples.
+            text (str): User-entered text.
+        Returns:
+            list: Updated chat history.
+        """
+        if not text:
+            raise gr.Error('Enter text')
+        history.append((text, ''))
+        return history
+    def create_prompt_template(self):
+        """
+        Create a prompt template for the chatbot.
+        """
+        template = (
+            f"The assistant should provide detailed explanations."
+            "Combine the chat history and follow up question into "
+            "Follow up question: What is this"
+        )
+        self.prompt = PromptTemplate.from_template(template)
+    def load_embeddings(self):
+        """
+        Load embeddings from Hugging Face and set in the config file.
+        """
+        self.embeddings = HuggingFaceEmbeddings(model_name=self.config.get("modelEmbeddings"))
+    def load_vectordb(self):
+        """
+        Load the vector database from the documents and embeddings.
+        """
+        self.vectordb = Chroma.from_documents(self.documents, self.embeddings)
+    def load_tokenizer(self):
+        """
+        Load the tokenizer from Hugging Face and set in the config file.
+        """
+        self.tokenizer = AutoTokenizer.from_pretrained(self.config.get("autoTokenizer"))
+    def load_model(self):
+        """
+        Load the causal language model from Hugging Face and set in the config file.
+        """
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.config.get("autoModelForCausalLM"),
+            device_map='auto',
+            torch_dtype=torch.float32,
+            token=True,
+            load_in_4bit=True
+        )
+    def create_pipeline(self):
+        """
+        Create a pipeline for text generation using the loaded model and tokenizer.
+        """
+        pipe = pipeline(
+            model=self.model,
+            task='text-generation',
+            tokenizer=self.tokenizer,
+            max_new_tokens=200
+        )
+        self.pipeline = HuggingFacePipeline(pipeline=pipe)
+    def create_chain(self):
+        """
+        Create a Conversational Retrieval Chain
+        """
+        self.chain = ConversationalRetrievalChain.from_llm(
+            self.pipeline,
+            chain_type="stuff",
+            retriever=self.vectordb.as_retriever(search_kwargs={"k": 1}),
+            condense_question_prompt=self.prompt,
+            return_source_documents=True
+        )
+    def process_file(self, file):
+        """
+        Process the uploaded PDF file and initialize necessary components: Tokenizer, VectorDB and LLM.
+        Parameters:
+            file (FileStorage): The uploaded PDF file.
+        """
+        self.create_prompt_template()
+        self.documents = PyPDFLoader(file.name).load()
+        self.load_embeddings()
+        self.load_vectordb()
+        self.load_tokenizer()
+        self.load_model()
+        self.create_pipeline()
+        self.create_chain()
+    def generate_response(self, history, query, file):
+        """
+        Generate a response based on user query and chat history.
+        Parameters:
+            history (list): List of chat history tuples.
+            query (str): User's query.
+            file (FileStorage): The uploaded PDF file.
+        Returns:
+            tuple: Updated chat history and a space.
+        """
+        if not query:
+            raise gr.Error(message='Submit a question')
+        if not file:
+            raise gr.Error(message='Upload a PDF')
+        if not self.processed:
+            self.process_file(file)
+            self.processed = True
+        result = self.chain({"question": query, 'chat_history': self.chat_history}, return_only_outputs=True)
+        self.chat_history.append((query, result["answer"]))
+        self.page = list(result['source_documents'][0])[1][1]['page']
+        for char in result['answer']:
+            history[-1][-1] += char
+        return history, " "
+    def render_file(self, file):
+        """
+        Renders a specific page of a PDF file as an image.
+        Parameters:
+            file (FileStorage): The PDF file.
+        Returns:
+            PIL.Image.Image: The rendered page as an image.
+        """
+        doc = fitz.open(file.name)
+        page = doc[self.page]
+        pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72))
+        image = Image.frombytes('RGB', [pix.width, pix.height], pix.samples)
+        return image