Spaces:

anand004
/

Multimodal-PDF-RAG

Runtime error

Multimodal-PDF-RAG / utils.py

bug fixes, faster ocr and restructure

e70cddd unverified 6 months ago

1.57 kB

	import base64
	import io

	import gradio as gr
	import pandas as pd
	import pymupdf
	from PIL import Image


	def image_to_bytes(image) -> str:
	    """Serialize an image to PNG and return it as base64 text.

	    Despite the name, the result is a ``str`` (base64 decoded as UTF-8),
	    not a ``bytes`` object.

	    Args:
	        image: Object exposing a PIL-style ``save(fp, format=...)`` method.

	    Returns:
	        The PNG data written by ``image.save``, base64-encoded, as a string.
	    """
	    buffer = io.BytesIO()
	    image.save(buffer, format="PNG")
	    # Requires the module-level ``import base64``; without it this line
	    # raises NameError the first time the function is called.
	    return base64.b64encode(buffer.getvalue()).decode("utf-8")


	def extract_pdfs(docs, doc_collection):
	    """Replace the document collection with newly uploaded files.

	    Args:
	        docs: Iterable of uploaded file paths (strings), or ``None`` / empty
	            when nothing was uploaded.
	        doc_collection: The current collection. It is replaced by a new list
	            holding ``docs`` when ``docs`` is non-empty; otherwise it is
	            returned unchanged.

	    Returns:
	        A 3-tuple of ``(doc_collection, gr.Tabs(selected=1), DataFrame)``
	        where the DataFrame has a single ``"Filename"`` column holding the
	        last ``/``-separated component of each path in ``docs``.
	    """
	    # A missing upload arrives as None; treat it as "no files" instead of
	    # letting list(None) raise TypeError.
	    uploaded = list(docs) if docs else []
	    if uploaded:
	        doc_collection = uploaded

	    filenames = [path.split("/")[-1] for path in uploaded]
	    return (
	        doc_collection,
	        gr.Tabs(selected=1),
	        pd.DataFrame(filenames, columns=["Filename"]),
	    )


	def extract_images(docs):
	    """Collect every embedded image from the given PDF files.

	    Args:
	        docs: Iterable of paths accepted by ``pymupdf.open``; ``None`` or an
	            empty iterable yields an empty result.

	    Returns:
	        A list of PIL images, one per entry reported by
	        ``page.get_images()``, in document/page/image order. Each image is
	        re-encoded as JPEG and opened from an in-memory buffer.
	    """
	    images = []
	    for doc_path in docs or []:
	        # The context manager closes the document even if extraction of a
	        # page or image raises; the collected images live in their own
	        # in-memory buffers and do not depend on the open document.
	        with pymupdf.open(doc_path) as doc:
	            for page in doc:
	                for img in page.get_images():
	                    xref = img[0]  # first field is the image's XREF
	                    pix = pymupdf.Pixmap(doc, xref)

	                    if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
	                        pix = pymupdf.Pixmap(pymupdf.csRGB, pix)

	                    images.append(
	                        Image.open(io.BytesIO(pix.pil_tobytes("JPEG")))
	                    )
	    return images


	def clean_text(text):
	    """Flatten text onto one line with single spaces between words.

	    Leading/trailing whitespace is stripped, newlines and tabs become
	    spaces, and every run of consecutive spaces is collapsed to one.

	    Args:
	        text: The string to normalize.

	    Returns:
	        The normalized string.
	    """
	    flattened = text.strip().replace("\n", " ").replace("\t", " ")
	    # Splitting on a single space yields empty pieces for each extra space
	    # in a run; dropping them collapses runs of any length, which a single
	    # replace() pass cannot do.
	    collapsed = " ".join(piece for piece in flattened.split(" ") if piece)
	    return collapsed.strip()