Spaces:
Running
Running
File size: 1,567 Bytes
e70cddd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import pymupdf
from PIL import Image
import io
import gradio as gr
import pandas as pd
def image_to_bytes(image):
img_byte_arr = io.BytesIO()
image.save(img_byte_arr, format="PNG")
return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
def extract_pdfs(docs, doc_collection):
if docs:
doc_collection = []
doc_collection.extend(docs)
return (
doc_collection,
gr.Tabs(selected=1),
pd.DataFrame([i.split("/")[-1] for i in list(docs)], columns=["Filename"]),
)
def extract_images(docs):
images = []
for doc_path in docs:
doc = pymupdf.open(doc_path) # open a document
for page_index in range(len(doc)): # iterate over pdf pages
page = doc[page_index] # get the page
image_list = page.get_images()
for image_index, img in enumerate(
image_list, start=1
): # enumerate the image list
xref = img[0] # get the XREF of the image
pix = pymupdf.Pixmap(doc, xref) # create a Pixmap
if pix.n - pix.alpha > 3: # CMYK: convert to RGB first
pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
images.append(Image.open(io.BytesIO(pix.pil_tobytes("JPEG"))))
return images
def clean_text(text):
text = text.strip()
cleaned_text = text.replace("\n", " ")
cleaned_text = cleaned_text.replace("\t", " ")
cleaned_text = cleaned_text.replace(" ", " ")
cleaned_text = cleaned_text.strip()
return cleaned_text
|