# Spaces:
# Runtime error
# Runtime error
# https://huggingface.co/spaces/Mishmosh/MichelleAssessment3 | |
#!pip install PyPDF2 | |
#!pip install sentencepiece | |
#!pip install pdfminer.six | |
#!pip install pdfplumber | |
#!pip install pdf2image | |
#!pip install Pillow | |
#!pip install pytesseract | |
# @title | |
# System packages used by the PDF-to-image and OCR steps below.
# The leading "!" shell escape only works inside an IPython/Colab cell and is
# a SyntaxError when this file is run as a plain Python script, so the
# commands are kept as comments, like the pip installs above. Run them in a
# notebook cell or a shell; on a Hugging Face Space, list the package names
# in packages.txt instead.
#!apt-get install poppler-utils
#!apt install tesseract-ocr
#!apt install libtesseract-dev
import PyPDF2 | |
from pdfminer.high_level import extract_pages, extract_text | |
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure | |
import pdfplumber | |
from PIL import Image | |
from pdf2image import convert_from_path | |
import pytesseract | |
import os | |
def text_extraction(element):
    """Return the text of a layout element together with the fonts it uses.

    Args:
        element: A text layout element. It must provide ``get_text()`` and be
            iterable; each ``LTTextContainer`` item it yields is in turn
            iterated for ``LTChar`` characters.

    Returns:
        A ``(line_text, format_per_line)`` tuple. ``line_text`` is whatever
        ``element.get_text()`` returns. ``format_per_line`` is a list of the
        distinct font names and font sizes read from the element's ``LTChar``
        characters (names and sizes share one list), in order of first
        appearance.
    """
    # Extracting the text from the in-line text element
    line_text = element.get_text()

    # A dict keeps one entry per distinct value and remembers insertion
    # order, so the result is the same from run to run. The earlier
    # list(set(...)) ordering changed with string hash randomisation and
    # first collected two entries for every single character.
    seen_formats = {}
    for text_line in element:
        # Only text containers are searched for characters.
        if not isinstance(text_line, LTTextContainer):
            continue
        for character in text_line:
            # Only LTChar items are inspected for font name and size.
            if isinstance(character, LTChar):
                seen_formats[character.fontname] = None
                seen_formats[character.size] = None

    # Return a tuple with the text in each line along with its format
    return (line_text, list(seen_formats))
# @title | |
# Create a function to crop the image elements from PDFs | |
def crop_image(element, pageObj, output_path='cropped_image.pdf'):
    """Crop a PDF page to an element's bounding box and save it as a new PDF.

    Args:
        element: Layout element exposing ``x0``, ``y0``, ``x1`` and ``y1``
            bounding-box attributes (assumed to be a pdfminer layout object,
            e.g. an ``LTFigure`` -- confirm against the caller).
        pageObj: PyPDF2 page object. Its ``mediabox`` is modified in place,
            so the caller's page stays cropped after this call.
        output_path: File the single-page cropped PDF is written to.
            Defaults to ``'cropped_image.pdf'``, the name used previously.
    """
    # NOTE: pdfminer bounding boxes are measured from the bottom-left corner
    # of the page, so y0 is the lower edge and y1 the upper edge. The earlier
    # code had the two swapped, which wrote a vertically inverted mediabox
    # (its lower-left corner was above its upper-right corner).
    image_left, image_bottom = element.x0, element.y0
    image_right, image_top = element.x1, element.y1

    # Crop the page by shrinking its media box to the element's rectangle
    pageObj.mediabox.lower_left = (image_left, image_bottom)
    pageObj.mediabox.upper_right = (image_right, image_top)

    # Put the cropped page into a fresh single-page document
    cropped_pdf_writer = PyPDF2.PdfWriter()
    cropped_pdf_writer.add_page(pageObj)

    # Save the cropped PDF to a new file
    with open(output_path, 'wb') as cropped_pdf_file:
        cropped_pdf_writer.write(cropped_pdf_file)