Spaces:

Mishmosh
/

MichelleAssessment3

Runtime error

App Files Files Community

MichelleAssessment3 / app.py

Mishmosh

Update app.py

edc3a03 12 months ago

raw

history blame

2.26 kB

	# https://huggingface.co/spaces/Mishmosh/MichelleAssessment3
	#!pip install PyPDF2
	#!pip install sentencepiece
	#!pip install pdfminer.six
	#!pip install pdfplumber
	#!pip install pdf2image
	#!pip install Pillow
	#!pip install pytesseract
	# @title
	# System packages needed by pdf2image (poppler) and pytesseract (tesseract).
	# The "!" shell escape only works in IPython/Jupyter; in a plain .py file it
	# is a SyntaxError, so the commands are kept here as comments only.
	# NOTE(review): on Hugging Face Spaces these are presumably meant to be listed
	# in a packages.txt file (poppler-utils, tesseract-ocr, libtesseract-dev) - confirm.
	#!apt-get install poppler-utils
	#!apt install tesseract-ocr
	#!apt install libtesseract-dev
	import PyPDF2
	from pdfminer.high_level import extract_pages, extract_text
	from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
	import pdfplumber
	from PIL import Image
	from pdf2image import convert_from_path
	import pytesseract
	import os
	def text_extraction(element):
	    """Return the text of a layout element together with its font formats.

	    Args:
	        element: Object exposing ``get_text()`` that can be iterated to
	            yield its text lines (presumably a pdfminer text container -
	            confirm against the caller).

	    Returns:
	        A ``(line_text, format_per_line)`` tuple. ``line_text`` is the value
	        of ``element.get_text()``; ``format_per_line`` lists each distinct
	        font name and font size found on the element's characters, in order
	        of first appearance.
	    """
	    # Extracting the text from the in-line text element
	    line_text = element.get_text()

	    # Collect the font name and font size of every character, line by line
	    line_formats = []
	    for text_line in element:
	        if not isinstance(text_line, LTTextContainer):
	            continue
	        for character in text_line:
	            if isinstance(character, LTChar):
	                line_formats.append(character.fontname)
	                line_formats.append(character.size)

	    # Deduplicate while keeping first-seen order. list(set(...)) yielded the
	    # same values but in an arbitrary order that could differ between runs.
	    format_per_line = list(dict.fromkeys(line_formats))

	    # Return a tuple with the text in each line along with its format
	    return (line_text, format_per_line)
	# @title
	# Create a function to crop the image elements from PDFs
	def crop_image(element, pageObj, output_path='cropped_image.pdf'):
	    """Crop a PDF page to an element's bounding box and save it as a PDF.

	    Args:
	        element: Object exposing ``x0``, ``y0``, ``x1`` and ``y1``
	            bounding-box coordinates (presumably a pdfminer layout
	            element - confirm against the caller).
	        pageObj: Page object whose ``mediabox`` corners are overwritten with
	            the bounding box. The page is modified in place, so the caller's
	            object stays cropped after this call.
	        output_path: Where the single-page cropped PDF is written. Defaults
	            to ``'cropped_image.pdf'``, the path that used to be hard-coded.
	    """
	    # Get the coordinates to crop the image from the PDF
	    # NOTE(review): y0 is labelled "top" and y1 "bottom" here; that may be
	    # inverted relative to the PDF coordinate system. The corner assignments
	    # below are kept exactly as before - confirm before renaming.
	    image_left, image_top = element.x0, element.y0
	    image_right, image_bottom = element.x1, element.y1

	    # Crop the page using coordinates (left, bottom, right, top)
	    pageObj.mediabox.lower_left = (image_left, image_bottom)
	    pageObj.mediabox.upper_right = (image_right, image_top)

	    # Save the cropped page to a new PDF
	    cropped_pdf_writer = PyPDF2.PdfWriter()
	    cropped_pdf_writer.add_page(pageObj)

	    # Save the cropped PDF to a new file
	    with open(output_path, 'wb') as cropped_pdf_file:
	        cropped_pdf_writer.write(cropped_pdf_file)