from io import BytesIO

import easyocr
import pypdfium2 as pdfium
from PIL import Image


def convert_pdf_to_images(file_path, scale=1):
    """Render every page of a PDF to JPEG-encoded bytes.

    Args:
        file_path: Input accepted by ``pypdfium2.PdfDocument`` (typically a
            path to a PDF file).
        scale: Rendering scale factor forwarded to the pypdfium2 page
            renderer. The default of 1 keeps the output identical to not
            passing a scale at all; larger values give higher-resolution
            images, which usually helps OCR.

    Returns:
        A list with one single-entry dict per page, in page order, mapping
        the zero-based page index to that page's JPEG bytes.
    """
    pdf_file = pdfium.PdfDocument(file_path)
    try:
        page_indices = list(range(len(pdf_file)))
        renderer = pdf_file.render(
            pdfium.PdfBitmap.to_pil,
            page_indices=page_indices,
            scale=scale,
        )

        list_final_images = []
        # The renderer yields images lazily, so it has to be fully consumed
        # before the document is closed in the ``finally`` clause below.
        for page_index, image in zip(page_indices, renderer):
            jpeg_buffer = BytesIO()
            image.save(jpeg_buffer, format='jpeg', optimize=True)
            list_final_images.append({page_index: jpeg_buffer.getvalue()})
    finally:
        # Release the underlying PDF handle deterministically instead of
        # waiting for garbage collection.
        pdf_file.close()

    return list_final_images


def extract_text_with_easyocr(list_dict_final_images):
    """Run English OCR over rendered page images and return the joined text.

    Args:
        list_dict_final_images: Iterable of dicts as produced by
            ``convert_pdf_to_images``; only the first value of each dict is
            used and it must be encoded image bytes.

    Returns:
        The recognised text fragments of all images, newline-separated, in
        input order. An empty (or ``None``) input yields an empty string.
    """
    # Constructing the reader loads the OCR model, which is expensive; skip
    # it entirely when there is nothing to recognise.
    if not list_dict_final_images:
        return ""

    language_reader = easyocr.Reader(['en'])

    image_content = []
    for page in list_dict_final_images:
        image_bytes = list(page.values())[0]
        image = Image.open(BytesIO(image_bytes))
        detections = language_reader.readtext(image)
        # Each detection is a sequence whose second item is the recognised
        # text (the other items are not needed here).
        image_content.append("\n".join(res[1] for res in detections))

    return "\n".join(image_content)