import os

import yaml
import fitz  # PyMuPDF
import torch
import gradio as gr
from PIL import Image
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFacePipeline
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


# Hugging Face access token used when downloading gated models. It must be a
# plain string, not a Gradio component; reading it from an environment
# variable here is an assumption about how the token is supplied.
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")


class PDFChatBot:

    def __init__(self, config_path="config.yaml"):
        """
        Initialize the PDFChatBot instance.

        Parameters:
            config_path (str): Path to the configuration file (default is "config.yaml").
        """
        self.processed = False
        self.page = 0
        self.chat_history = []
        self.config = self.load_config(config_path)

        self.prompt = None
        self.documents = None
        self.embeddings = None
        self.vectordb = None
        self.tokenizer = None
        self.model = None
        self.pipeline = None
        self.chain = None

    def load_config(self, file_path):
        """
        Load configuration from a YAML file.

        Parameters:
            file_path (str): Path to the YAML configuration file.

        Returns:
            dict: Configuration as a dictionary.
        """
        with open(file_path, 'r') as stream:
            try:
                config = yaml.safe_load(stream)
                return config
            except yaml.YAMLError as exc:
                print(f"Error loading configuration: {exc}")
                return None

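    # Example of the config.yaml layout this class expects. The keys come from
    # the config lookups in the methods below; the model names shown are
    # illustrative assumptions only, not taken from the original project:
    #
    #   modelEmbeddings: "sentence-transformers/all-MiniLM-L6-v2"
    #   autoTokenizer: "meta-llama/Llama-2-7b-chat-hf"
    #   autoModelForCausalLM: "meta-llama/Llama-2-7b-chat-hf"
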
    def add_text(self, history, text):
        """
        Add user-entered text to the chat history.

        Parameters:
            history (list): List of chat history pairs.
            text (str): User-entered text.

        Returns:
            list: Updated chat history.
        """
        if not text:
            raise gr.Error('Enter text')
        # Append a mutable [user, bot] pair so the bot reply can be filled in
        # character by character later in generate_response.
        history.append([text, ''])
        return history

    def create_prompt_template(self):
        """
        Create a prompt template for the chatbot.
        """
        # Reconstructed condense-question prompt: ConversationalRetrievalChain
        # expects the {chat_history} and {question} placeholders, which the
        # original truncated template was missing. The exact wording is an
        # assumption.
        template = (
            "The assistant should provide detailed explanations. "
            "Combine the chat history and follow-up question into a standalone question.\n"
            "Chat history: {chat_history}\n"
            "Follow-up question: {question}"
        )
        self.prompt = PromptTemplate.from_template(template)

    def load_embeddings(self):
        """
        Load embeddings from Hugging Face using the model name set in the config file.
        """
        self.embeddings = HuggingFaceEmbeddings(model_name=self.config.get("modelEmbeddings"))

    def load_vectordb(self):
        """
        Load the vector database from the documents and embeddings.
        """
        self.vectordb = Chroma.from_documents(self.documents, self.embeddings)

    def load_tokenizer(self):
        """
        Load the tokenizer from Hugging Face using the model name set in the config file.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config.get("autoTokenizer"),
            use_auth_token=HUGGINGFACE_TOKEN
        )

    def load_model(self):
        """
        Load the causal language model from Hugging Face using the model name set in the config file.
        """
        self.model = AutoModelForCausalLM.from_pretrained(
            self.config.get("autoModelForCausalLM"),
            device_map='auto',
            torch_dtype=torch.float32,
            use_auth_token=HUGGINGFACE_TOKEN,
            load_in_8bit=False
        )

    def create_pipeline(self):
        """
        Create a pipeline for text generation using the loaded model and tokenizer.
        """
        pipe = pipeline(
            model=self.model,
            task='text-generation',
            tokenizer=self.tokenizer,
            max_new_tokens=200
        )
        self.pipeline = HuggingFacePipeline(pipeline=pipe)

    def create_chain(self):
        """
        Create a Conversational Retrieval Chain.
        """
        self.chain = ConversationalRetrievalChain.from_llm(
            self.pipeline,
            chain_type="stuff",
            retriever=self.vectordb.as_retriever(search_kwargs={"k": 1}),
            condense_question_prompt=self.prompt,
            return_source_documents=True
        )

    def process_file(self, file):
        """
        Process the uploaded PDF file and initialize necessary components: Tokenizer, VectorDB and LLM.

        Parameters:
            file (FileStorage): The uploaded PDF file.
        """
        self.create_prompt_template()
        self.documents = PyPDFLoader(file.name).load()
        self.load_embeddings()
        self.load_vectordb()
        self.load_tokenizer()
        self.load_model()
        self.create_pipeline()
        self.create_chain()

    def generate_response(self, history, query, file):
        """
        Generate a response based on the user query and chat history.

        Parameters:
            history (list): List of chat history pairs.
            query (str): User's query.
            file (FileStorage): The uploaded PDF file.

        Returns:
            tuple: Updated chat history and a space (used to clear the query textbox).
        """
        if not query:
            raise gr.Error(message='Submit a question')
        if not file:
            raise gr.Error(message='Upload a PDF')
        if not self.processed:
            self.process_file(file)
            self.processed = True

        result = self.chain({"question": query, "chat_history": self.chat_history}, return_only_outputs=True)
        self.chat_history.append((query, result["answer"]))
        # Remember which source page the answer was drawn from so render_file can show it.
        self.page = result['source_documents'][0].metadata['page']

        # Stream the answer into the last chat entry character by character.
        for char in result['answer']:
            history[-1][-1] += char
        return history, " "

    def render_file(self, file):
        """
        Render the current page of the PDF file as an image.

        Parameters:
            file (FileStorage): The PDF file.

        Returns:
            PIL.Image.Image: The rendered page as an image.
        """
        doc = fitz.open(file.name)
        page = doc[self.page]
        # Render at 300 DPI (PDF user space is 72 DPI).
        pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72))
        image = Image.frombytes('RGB', [pix.width, pix.height], pix.samples)
        return image
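
# A minimal usage sketch, not part of the original class: it assumes a typical
# Gradio Blocks layout for this kind of app (question textbox, chatbot, PDF
# upload and a page preview image). Component names and event wiring here are
# illustrative assumptions.
if __name__ == "__main__":
    pdf_bot = PDFChatBot()

    with gr.Blocks() as demo:
        with gr.Row():
            chatbot = gr.Chatbot()
            show_img = gr.Image(label="Current page")
        with gr.Row():
            text_input = gr.Textbox(placeholder="Ask a question about the PDF")
            uploaded_pdf = gr.File(label="Upload PDF", file_types=[".pdf"])

        # On submit: record the question, generate the answer, then re-render
        # the page the answer was drawn from.
        text_input.submit(pdf_bot.add_text, [chatbot, text_input], [chatbot]).then(
            pdf_bot.generate_response, [chatbot, text_input, uploaded_pdf], [chatbot, text_input]
        ).then(
            pdf_bot.render_file, [uploaded_pdf], [show_img]
        )

    demo.launch()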