Spaces:
Runtime error
Runtime error
| import easyocr | |
| from PIL import Image | |
| from io import BytesIO | |
| import pypdfium2 as pdfium | |
def convert_pdf_to_images(file_path):
    """Render each page of a PDF and return the pages as JPEG byte strings.

    Args:
        file_path: Location of the PDF; handed unchanged to
            ``pdfium.PdfDocument``.

    Returns:
        A list with one single-entry dict per page, in page order. Each
        dict maps the zero-based page index to that page's JPEG-encoded
        bytes.
    """
    document = pdfium.PdfDocument(file_path)
    page_numbers = list(range(len(document)))

    # ``to_pil`` is passed as the converter, so every rendered page is
    # expected to arrive as a PIL image (it is written out via ``.save``
    # below). No scale is passed, so the library default applies.
    rendered_pages = document.render(
        pdfium.PdfBitmap.to_pil,
        page_indices=page_numbers,
    )

    pages_as_jpeg = []
    for page_number, page_image in zip(page_numbers, rendered_pages):
        # Encode into an in-memory buffer instead of touching the disk.
        buffer = BytesIO()
        page_image.save(buffer, format='jpeg', optimize=True)
        pages_as_jpeg.append({page_number: buffer.getvalue()})
    return pages_as_jpeg
def extract_text_with_easyocr(list_dict_final_images):
    """Run English EasyOCR over a list of page images and join the text.

    Args:
        list_dict_final_images: List of single-entry dicts whose value is
            the encoded image bytes of one page (the shape produced by
            ``convert_pdf_to_images``). Only the first value of each dict
            is read; the keys are ignored.

    Returns:
        A single string. The text fragments recognised on one page are
        joined with newlines, and the pages are then joined with newlines
        in input order. An empty input list yields an empty string.
    """
    # The reader is created once per call and reused for every page.
    language_reader = easyocr.Reader(['en'])
    image_list = [list(data.values())[0] for data in list_dict_final_images]

    page_texts = []
    for image_bytes in image_list:
        # Pass the encoded bytes straight to EasyOCR. Raw bytes are one of
        # the input types ``readtext`` documents (alongside a path/URL and a
        # numpy array), whereas a PIL image object is not, so this avoids
        # depending on undocumented input handling. ``bytes(...)`` is a
        # no-op for ``bytes`` and normalises other bytes-like values.
        detections = language_reader.readtext(bytes(image_bytes))
        # Index 1 of each detection is joined as the recognised text
        # (EasyOCR's default result is (box, text, confidence) per hit).
        page_texts.append("\n".join(hit[1] for hit in detections))
    return "\n".join(page_texts)