ask-your-pdf / ocr.py
dj9801's picture
removed gemini API and improved the UI
ac83258
raw
history blame
No virus
1.17 kB
import easyocr
from PIL import Image
from io import BytesIO
import pypdfium2 as pdfium
def convert_pdf_to_images(file_path, scale=1):
    """Render every page of a PDF to JPEG-encoded bytes.

    Args:
        file_path: Input handed straight to ``pdfium.PdfDocument``
            (typically the path of the PDF file).
        scale: Rendering scale factor forwarded to the pypdfium2 renderer.
            The default of 1 is the library default, so existing callers
            get the same output as before; larger values produce
            higher-resolution images (e.g. ``300 / 72`` for roughly
            300 DPI), which generally helps OCR accuracy.

    Returns:
        A list with one single-entry dict per page, in page order, each
        mapping the zero-based page index to that page's JPEG bytes.
    """
    pdf_file = pdfium.PdfDocument(file_path)
    try:
        page_indices = list(range(len(pdf_file)))
        renderer = pdf_file.render(
            pdfium.PdfBitmap.to_pil,
            page_indices=page_indices,
            scale=scale,
        )
        list_final_images = []
        for i, image in zip(page_indices, renderer):
            # Encode each rendered PIL image in memory; nothing is written
            # to disk.
            image_byte_array = BytesIO()
            image.save(image_byte_array, format='jpeg', optimize=True)
            list_final_images.append({i: image_byte_array.getvalue()})
        return list_final_images
    finally:
        # Release the native PDFium document handle even if rendering or
        # JPEG encoding fails part-way through.
        pdf_file.close()
def extract_text_with_easyocr(list_dict_final_images):
    """Run English OCR over page images and return the combined text.

    Args:
        list_dict_final_images: List of single-entry dicts whose value is
            the encoded image bytes of one page (the structure returned by
            ``convert_pdf_to_images``). Only the first value of each dict
            is read.

    Returns:
        The recognised text as one string: text fragments within a page
        are joined with newlines, and pages are joined with newlines too.
        An empty input yields an empty string.
    """
    if not list_dict_final_images:
        # Nothing to OCR: return what joining zero pages would produce,
        # without paying for the construction of the EasyOCR reader.
        return ""

    language_reader = easyocr.Reader(['en'])
    image_content = []
    for data in list_dict_final_images:
        image_bytes = list(data.values())[0]
        image = Image.open(BytesIO(image_bytes))
        detections = language_reader.readtext(image)
        # Each detection is indexed at position 1 for its text; the other
        # fields of the result are not used here.
        image_content.append("\n".join(res[1] for res in detections))
    return "\n".join(image_content)