Multimodal-PDF-Chatbot

Running

App Files Files Community

Multimodal-PDF-Chatbot / app.py

anand004

Create app.py

3f98f11 verified about 1 month ago

raw

history blame

14.5 kB

	import gradio as gr
	from unstructured.partition.pdf import partition_pdf
	import pymupdf
	from PIL import Image
	import numpy as np
	import io
	import pandas as pd
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	import gc
	import torch
	import chromadb
	from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction
	from chromadb.utils.data_loaders import ImageLoader
	from sentence_transformers import SentenceTransformer
	from chromadb.utils import embedding_functions
	from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
	import base64
	from langchain_community.llms import HuggingFaceEndpoint
	from langchain import PromptTemplate
	import spaces

	# Load the LLaVA-NeXT processor and its 4-bit quantised model, but only on
	# hosts where CUDA is available.
	# NOTE(review): when CUDA is missing, neither `processor` nor `vision_model`
	# is bound, so get_image_descriptions() cannot run on such hosts - confirm
	# that this is the intended behaviour.
	if torch.cuda.is_available():
	    processor = LlavaNextProcessor.from_pretrained(
	        "llava-hf/llava-v1.6-mistral-7b-hf"
	    )
	    vision_model = LlavaNextForConditionalGeneration.from_pretrained(
	        "llava-hf/llava-v1.6-mistral-7b-hf",
	        torch_dtype=torch.float16,
	        low_cpu_mem_usage=True,
	        load_in_4bit=True,
	    )


	def image_to_bytes(image):
	    """Encode an image as PNG and return the result as base64 text.

	    Despite the name, the return value is a ``str`` holding the base64
	    representation of the PNG data, not a ``bytes`` object.

	    Args:
	        image: Object exposing ``save(fileobj, format=...)``, as PIL images do.

	    Returns:
	        The base64-encoded PNG data decoded as UTF-8 text.
	    """
	    png_buffer = io.BytesIO()
	    image.save(png_buffer, format="PNG")
	    raw_png = png_buffer.getvalue()
	    encoded = base64.b64encode(raw_png)
	    return encoded.decode("utf-8")


	@spaces.GPU
	def get_image_descriptions(images):
	    """Generate a one-sentence description for every image with LLaVA.

	    Relies on the module-level ``processor`` and ``vision_model``, which are
	    only created when CUDA is available.

	    Args:
	        images: Iterable of images accepted by the LLaVA processor (the
	            callers in this file pass PIL images).

	    Returns:
	        A list with one description string per input image, in input order.
	        An empty input yields an empty list.
	    """
	    torch.cuda.empty_cache()
	    gc.collect()

	    prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
	    answer_marker = "[/INST]"

	    descriptions = []
	    for img in images:
	        # Keyword arguments keep the call valid regardless of the positional
	        # order (text, images) vs (images, text) used by the installed release.
	        inputs = processor(text=prompt, images=img, return_tensors="pt").to(
	            "cuda:0"
	        )
	        output = vision_model.generate(**inputs, max_new_tokens=100)
	        decoded = processor.decode(output[0], skip_special_tokens=True)
	        # The decoded sequence echoes the prompt before the answer; keep only
	        # the text after the closing instruction marker so that the stored
	        # description is the caption itself. If the marker is absent the
	        # whole decoded text is kept.
	        descriptions.append(decoded.split(answer_marker)[-1].strip())
	    return descriptions


	# Custom stylesheet passed to gr.Blocks(css=CSS); it sets the background
	# colour of the element whose id is "table_col".
	CSS = """
	#table_col {background-color: rgb(33, 41, 54);}
	"""


	def extract_pdfs(docs, doc_collection):
	    """Register the uploaded PDFs and switch the UI to the extraction tab.

	    Args:
	        docs: Uploaded file paths from the file component (``None`` or empty
	            when nothing was uploaded). Entries are assumed to be path
	            strings - TODO confirm against the installed Gradio version.
	        doc_collection: Previous value of the document state. It is replaced
	            rather than mutated, so the old list is left untouched.

	    Returns:
	        A tuple ``(documents, tabs_update, filename_table)``: the new list of
	        document paths, a ``gr.Tabs`` update selecting tab 1, and a
	        single-column DataFrame ("Filename") with the base name of each file.

	    Raises:
	        gr.Error: If no document was uploaded.
	    """
	    import os

	    # Previously a missing upload (docs is None) crashed with a TypeError
	    # inside list.extend(); surface a readable message instead.
	    if not docs:
	        raise gr.Error("Please upload at least one PDF file")

	    selected_docs = list(docs)
	    filenames = [os.path.basename(path) for path in selected_docs]
	    return (
	        selected_docs,
	        gr.Tabs(selected=1),
	        pd.DataFrame(filenames, columns=["Filename"]),
	    )


	def extract_images(docs):
	    """Extract every embedded image from the given PDF files.

	    Args:
	        docs: Iterable of PDF file paths.

	    Returns:
	        A list of PIL images (JPEG-decoded), in document/page/image order.
	    """
	    images = []
	    for doc_path in docs:
	        # The context manager closes the document even if a page fails to
	        # decode; the original never closed it.
	        with pymupdf.open(doc_path) as doc:
	            for page in doc:
	                for img in page.get_images():
	                    xref = img[0]  # first entry is the image's XREF
	                    pix = pymupdf.Pixmap(doc, xref)

	                    if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
	                        pix = pymupdf.Pixmap(pymupdf.csRGB, pix)

	                    if pix.alpha:
	                        # JPEG has no alpha channel; drop it so encoding
	                        # cannot fail on transparent images.
	                        pix = pymupdf.Pixmap(pix, 0)

	                    jpeg_bytes = pix.pil_tobytes("JPEG")
	                    images.append(Image.open(io.BytesIO(jpeg_bytes)))
	    return images


	# def get_vectordb(text, images, tables):
	def get_vectordb(text, images):
	    """Build the Chroma collections holding text chunks and image captions.

	    Two collections are (re)created on an ephemeral client:

	    * ``text_db``  - 500-character chunks of ``text``.
	    * ``image_db`` - one caption per image (from ``get_image_descriptions``),
	      with the base64-encoded PNG stored under the ``"image"`` metadata key.

	    Args:
	        text: Full extracted document text.
	        images: List of PIL images extracted from the documents.

	    Returns:
	        The Chroma client that owns both collections.
	    """
	    client = chromadb.EphemeralClient()
	    loader = ImageLoader()
	    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
	        model_name="multi-qa-mpnet-base-dot-v1"
	    )

	    # Remove collections that already exist on the client before recreating
	    # them (presumably left over from an earlier extraction run).
	    existing = {collection.name for collection in client.list_collections()}
	    for stale_name in ("text_db", "image_db"):
	        if stale_name in existing:
	            client.delete_collection(stale_name)

	    text_collection = client.get_or_create_collection(
	        name="text_db",
	        embedding_function=sentence_transformer_ef,
	        data_loader=loader,
	    )
	    image_collection = client.get_or_create_collection(
	        name="image_db",
	        embedding_function=sentence_transformer_ef,
	        data_loader=loader,
	        metadata={"hnsw:space": "cosine"},
	    )

	    # Skip the insert (and the captioning pass) when there is nothing to add,
	    # so an empty id list is never passed to the collection.
	    if images:
	        image_descriptions = get_image_descriptions(images)
	        # One metadata dict per image. The previous dict comprehension wrapped
	        # in a list produced a single dict, so ids and metadatas disagreed in
	        # length for any input other than exactly one image.
	        image_metadatas = [{"image": image_to_bytes(img)} for img in images]
	        image_collection.add(
	            ids=[str(index) for index in range(len(images))],
	            documents=image_descriptions,
	            metadatas=image_metadatas,
	        )

	    splitter = RecursiveCharacterTextSplitter(
	        chunk_size=500,
	        chunk_overlap=10,
	    )
	    doc_texts = [chunk.page_content for chunk in splitter.create_documents([text])]
	    if doc_texts:
	        text_collection.add(
	            ids=[str(index) for index in range(len(doc_texts))],
	            documents=doc_texts,
	        )
	    return client


	def extract_data_from_pdfs(docs, session, progress=gr.Progress()):
	    """Extract images and text from the PDFs and index them in the vector DB.

	    Args:
	        docs: List of PDF file paths to process.
	        session: Session-state dict; ``session["processed"]`` is set to True
	            on success.
	        progress: Gradio progress tracker used to report the current stage.

	    Returns:
	        A tuple matching the outputs wired to the "Extract Data" button:
	        ``(vectordb_client, session, row_update, text_preview,
	        image_preview, status_html)``.

	    Raises:
	        gr.Error: If ``docs`` is empty or missing.
	    """
	    if not docs:
	        raise gr.Error("No documents to process")

	    progress(0, "Extracting Images")
	    images = extract_images(docs)

	    progress(0.25, "Extracting Text")
	    all_elements = []
	    for doc in docs:
	        all_elements.extend(
	            partition_pdf(
	                filename=doc,
	                strategy="hi_res",
	                infer_table_structure=True,
	                model_name="yolox",
	            )
	        )

	    # Collect the pieces and join once instead of growing a string with +=.
	    text_parts = []
	    for element in all_elements:
	        meta = element.to_dict()
	        element_type = meta["type"].lower()
	        if element_type in ("table", "figurecaption"):
	            # Tables and figure captions are not part of the indexed text.
	            continue
	        if element_type in ("listitem", "title"):
	            text_parts.append("\n\n" + meta["text"] + "\n")
	        else:
	            # Trailing space keeps the last word of one element from being
	            # fused with the first word of the next.
	            text_parts.append(meta["text"] + " ")
	    all_text = "".join(text_parts)

	    # get_vectordb() generates the image descriptions itself, so they are no
	    # longer computed a second time here only to be thrown away.
	    progress(0.5, "Generating image descriptions")
	    vectordb = get_vectordb(all_text, images)

	    progress(1, "Completed")
	    session["processed"] = True
	    return (
	        vectordb,
	        session,
	        gr.Row(visible=True),
	        all_text[:2000] + "...",
	        images[:2],
	        "<h1 style='text-align: center'>Completed</h1>",
	    )


	# Module-level embedding function that conversation() passes to
	# get_collection() when it opens the "text_db" and "image_db" collections.
	# It uses the same model name as the embedding function built in get_vectordb().
	sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
	model_name="multi-qa-mpnet-base-dot-v1"
	)


	def conversation(vectordb_client, msg, num_context, img_context, history):
	    """Answer a question using retrieved text chunks and image captions.

	    Args:
	        vectordb_client: Chroma client returned by ``get_vectordb`` (``None``
	            if no extraction has been run yet).
	        msg: The user's question.
	        num_context: Number of text chunks to retrieve.
	        img_context: Number of images to retrieve.
	        history: Existing chat history as a list of ``(user, bot)`` tuples.

	    Returns:
	        A tuple ``(history, context, images)``: the history extended with the
	        new exchange, the retrieved text context joined by blank lines, and
	        the retrieved images as PIL images.

	    Raises:
	        gr.Error: If no vector database is available yet.
	    """
	    if vectordb_client is None:
	        raise gr.Error("Please extract data first")

	    text_collection = vectordb_client.get_collection(
	        "text_db", embedding_function=sentence_transformer_ef
	    )
	    image_collection = vectordb_client.get_collection(
	        "image_db", embedding_function=sentence_transformer_ef
	    )

	    # int() guards against the sliders delivering floats.
	    results = text_collection.query(
	        query_texts=[msg], include=["documents"], n_results=int(num_context)
	    )["documents"][0]

	    similar_images = image_collection.query(
	        query_texts=[msg],
	        include=["metadatas", "distances", "documents"],
	        n_results=int(img_context),
	    )
	    # Each metadata entry carries the base64-encoded PNG under "image";
	    # entries without it are skipped rather than raising KeyError.
	    img_links = [
	        meta["image"]
	        for meta in similar_images["metadatas"][0]
	        if meta and "image" in meta
	    ]
	    images_and_locs = [
	        Image.open(io.BytesIO(base64.b64decode(encoded))) for encoded in img_links
	    ]

	    if img_links:
	        img_desc = "\n".join(similar_images["documents"][0])
	    else:
	        img_desc = "No Images Are Provided"

	    template = """
	Context:
	{context}

	Included Images:
	{images}

	Question:
	{question}

	Answer:

	"""
	    # "images" is listed as well: the template references {images}, and it was
	    # missing from input_variables before.
	    prompt = PromptTemplate(
	        template=template, input_variables=["context", "images", "question"]
	    )
	    context = "\n\n".join(results)
	    response = llm.invoke(
	        prompt.format(context=context, question=msg, images=img_desc)
	    )
	    return (history or []) + [(msg, response)], context, images_and_locs


	def check_validity_and_llm(session_states):
	    """Allow navigation to the chat tab only after data has been extracted.

	    Args:
	        session_states: Session-state dict; its ``"processed"`` flag is set
	            by ``extract_data_from_pdfs`` on success.

	    Returns:
	        A ``gr.Tabs`` update selecting tab 2 (the chat tab).

	    Raises:
	        gr.Error: If the ``"processed"`` flag is missing or falsy.
	    """
	    # Guard clause with a plain truthiness test instead of `== True`.
	    if not session_states.get("processed", False):
	        raise gr.Error("Please extract data first")
	    return gr.Tabs(selected=2)


	def get_stats(vectordb):
	    """Return placeholder statistics for the (currently disabled) stats view.

	    The only call site in this file is commented out.

	    Args:
	        vectordb: Object exposing a ``get()`` method whose result supports
	            ``len()``.

	    Returns:
	        A 3-tuple of strings: a two-line text summary followed by two empty
	        strings.
	    """
	    # NOTE(review): len() is applied directly to whatever get() returns; if
	    # that is a mapping this counts its keys, not stored chunks - confirm.
	    fetched = vectordb.get()
	    summary_lines = [f"Chunks: {len(fetched)}", "HIII"]
	    summary = "\n".join(summary_lines)
	    return summary, "", ""


	# Text-generation LLM used by conversation() to answer questions: the
	# Mixtral-8x7B-Instruct model reached through a Hugging Face endpoint,
	# with temperature 0.4 and at most 800 new tokens per response.
	llm = HuggingFaceEndpoint(
	repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
	temperature=0.4,
	max_new_tokens=800,
	)

	# Gradio UI: three tabs (upload -> extract -> chat) plus the event wiring.
	# NOTE(review): the nesting of the layout containers below was reconstructed
	# from the statement order; verify it against the intended layout.
	with gr.Blocks(css=CSS) as demo:

	    # Per-session state: the Chroma client, the uploaded file paths and a
	    # dict holding the "processed" flag.
	    vectordb = gr.State()
	    doc_collection = gr.State(value=[])
	    session_states = gr.State(value={})
	    gr.Markdown(
	        """<h2><center>Multimodal PDF Chatbot</center></h2>
	<h3><center><b>Interact With Your PDF Documents</b></center></h3>"""
	    )
	    # Closing tags fixed: </h3></center> instead of </center><h3>.
	    gr.Markdown(
	        """<center><h3><b>Note: </b> This application leverages advanced Retrieval-Augmented Generation (RAG) techniques to provide context-aware responses from your PDF documents</h3></center><br>
	<center>Utilizing multimodal capabilities, this chatbot can interpret and answer queries based on both textual and visual information within your PDFs.</center>"""
	    )
	    # Closing tag fixed: </center> instead of a second <center>.
	    gr.Markdown(
	        """
	<center><b>Warning: </b> Extracting text and images from your document and generating embeddings may take some time due to the use of OCR and multimodal LLMs for image description</center>
	"""
	    )
	    with gr.Tabs() as tabs:
	        with gr.TabItem("Upload PDFs", id=0) as pdf_tab:
	            with gr.Row():
	                with gr.Column():
	                    documents = gr.File(
	                        file_count="multiple",
	                        # Extensions need the leading dot to be recognised.
	                        file_types=[".pdf"],
	                        interactive=True,
	                        label="Upload your PDF file/s",
	                    )
	                    pdf_btn = gr.Button(value="Next", elem_id="button1")

	        with gr.TabItem("Extract Data", id=1) as preprocess:
	            with gr.Row():
	                with gr.Column():
	                    back_p1 = gr.Button(value="Back")
	                with gr.Column():
	                    embed = gr.Button(value="Extract Data")
	                with gr.Column():
	                    next_p1 = gr.Button(value="Next")

	            with gr.Row() as row:
	                with gr.Column():
	                    selected = gr.Dataframe(
	                        interactive=False,
	                        col_count=(1, "fixed"),
	                        headers=["Selected Files"],
	                    )
	                with gr.Column(variant="panel"):
	                    # Closing tag fixed: </h1> instead of a second <h1>.
	                    prog = gr.HTML(
	                        value="<h1 style='text-align: center'>Click the 'Extract' button to extract data from PDFs</h1>"
	                    )

	            with gr.Accordion("See Parts of Extracted Data", open=False):
	                with gr.Column(visible=True) as sample_data:
	                    with gr.Row():
	                        with gr.Column():
	                            ext_text = gr.Textbox(
	                                label="Sample Extracted Text", lines=15
	                            )
	                        with gr.Column():
	                            images = gr.Gallery(
	                                label="Sample Extracted Images", columns=1, rows=2
	                            )

	                    # with gr.Row():
	                    #     image_desc = gr.Textbox(label="Image Descriptions", interactive=False)
	                    # with gr.Row(variant="panel"):
	                    #     ext_tables = gr.HTML("<h3>Sample Tables</h3>", label="Extracted Tables")

	        # with gr.TabItem("Embeddings", id=3) as embed_tab:
	        #     with gr.Row():
	        #         with gr.Column():
	        #             back_p2 = gr.Button(value="Back")
	        #         with gr.Column():
	        #             view_stats = gr.Button(value="View Stats")
	        #         with gr.Column():
	        #             next_p2 = gr.Button(value="Next")

	        #     with gr.Row():
	        #         with gr.Column():
	        #             text_stats = gr.Textbox(label="Text Stats", interactive=False)
	        #         with gr.Column():
	        #             table_stats = gr.Textbox(label="Table Stats", interactive=False)
	        #         with gr.Column():
	        #             image_stats = gr.Textbox(label="Image Stats", interactive=False)

	        with gr.TabItem("Chat", id=2) as chat_tab:
	            with gr.Column():
	                choice = gr.Radio(
	                    ["chromaDB"],
	                    value="chromaDB",
	                    label="Vector Database",
	                    interactive=True,
	                )
	                num_context = gr.Slider(
	                    label="Number of text context elements",
	                    minimum=1,
	                    maximum=20,
	                    step=1,
	                    interactive=True,
	                    value=3,
	                )
	                img_context = gr.Slider(
	                    label="Number of image context elements",
	                    minimum=1,
	                    maximum=10,
	                    step=1,
	                    interactive=True,
	                    value=2,
	                )
	            with gr.Row():
	                with gr.Column():
	                    # The first positional argument of Gallery is its value,
	                    # so the caption must be passed as `label`.
	                    ret_images = gr.Gallery(
	                        label="Similar Images", columns=1, rows=2
	                    )
	                with gr.Column():
	                    chatbot = gr.Chatbot(height=400)
	            with gr.Accordion("Text References", open=False):
	                with gr.Row():
	                    text_context = gr.Textbox(interactive=False, lines=10)

	            with gr.Row():
	                msg = gr.Textbox(
	                    placeholder="Type your question here (e.g. 'What is this document about?')",
	                    interactive=True,
	                    container=True,
	                )
	            with gr.Row():
	                submit_btn = gr.Button("Submit message")
	                clear_btn = gr.ClearButton([msg, chatbot], value="Clear conversation")

	    # Upload tab: store the selected files and move on to the extraction tab.
	    pdf_btn.click(
	        fn=extract_pdfs,
	        inputs=[documents, doc_collection],
	        outputs=[doc_collection, tabs, selected],
	    )
	    # Extraction tab: run the pipeline; the output order must match the tuple
	    # returned by extract_data_from_pdfs.
	    embed.click(
	        extract_data_from_pdfs,
	        inputs=[doc_collection, session_states],
	        outputs=[
	            vectordb,
	            session_states,
	            sample_data,
	            ext_text,
	            # ext_tables,
	            images,
	            prog,
	            # image_desc
	        ],
	    )

	    # Chat tab: answer the question and show the retrieved context.
	    submit_btn.click(
	        conversation,
	        [vectordb, msg, num_context, img_context, chatbot],
	        [chatbot, text_context, ret_images],
	    )

	    # view_stats.click(
	    #     get_stats, [vectordb], outputs=[text_stats, table_stats, image_stats]
	    # )

	    # Page Navigation

	    back_p1.click(lambda: gr.Tabs(selected=0), None, tabs)

	    next_p1.click(check_validity_and_llm, session_states, tabs)
	# Script entry point: start the Gradio server (with a public share link
	# requested) only when this file is executed directly.
	if __name__ == "__main__":
	    demo.launch(share=True)