Spaces:

zmbfeng
/

locked_pdf_ingestion

Sleeping

App Files Files Community

locked_pdf_ingestion / utils.py

zmbfeng

figures, tables, and other text book with above text recognized and below text included in the block text image

15cd602 6 months ago

raw

history blame

9.91 kB

	import subprocess
	import streamlit as st
	import cv2
	import numpy as np
	from PIL import Image
	import pytesseract
	def get_pdf_page_count(pdf_path):
	    """Return the page count that the ``pdfinfo`` tool reports for a PDF.

	    Args:
	        pdf_path: Path of the PDF file; passed to ``pdfinfo`` as its only
	            argument.

	    Returns:
	        The page count as an ``int``, or ``None`` when ``pdfinfo`` cannot be
	        run, prints no ``Pages:`` field, or prints one that is not an
	        integer.
	    """
	    try:
	        # Running pdfinfo command to get information about the PDF
	        result = subprocess.run(['pdfinfo', pdf_path], stdout=subprocess.PIPE, text=True)
	        for line in result.stdout.splitlines():
	            # Match the field name at the start of the line only: another
	            # field's value (e.g. a title) may itself contain "Pages:".
	            if line.startswith('Pages:'):
	                return int(line.split(':', 1)[1].strip())
	    except Exception as e:
	        # Deliberately best effort: every failure is reported and mapped
	        # to None instead of propagating to the caller.
	        print(f"An error occurred: {e}")
	    return None
	# Extract rectangles whose minimum width and height are configurable

	def extract_rectangle_from_image(gray, min_width, min_height):
	    """Find four-cornered contours and return their bounding boxes.

	    The image is run through Canny edge detection (thresholds 50/150), the
	    edges are dilated once with a 3x3 kernel, and every contour whose
	    polygon approximation (tolerance: 1% of its perimeter) has exactly four
	    vertices is treated as a rectangle.

	    Args:
	        gray: Image array handed to ``cv2.Canny``; presumably single-channel
	            8-bit, as the name suggests (confirm against the caller).
	        min_width: Smallest bounding-box width that is kept.
	        min_height: Smallest bounding-box height that is kept.

	    Returns:
	        A list of ``(x, y, w, h)`` tuples in contour order.
	    """
	    edge_map = cv2.Canny(gray, 50, 150, apertureSize=3)
	    structuring = np.ones((3,3), np.uint8)
	    thick_edges = cv2.dilate(edge_map, structuring, iterations=1)
	    outlines, _ = cv2.findContours(thick_edges, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

	    found = []
	    for outline in outlines:
	        tolerance = 0.01*cv2.arcLength(outline, True)
	        polygon = cv2.approxPolyDP(outline, tolerance, True)
	        if len(polygon) != 4:
	            # Not a quadrilateral, so not a rectangle candidate.
	            continue
	        left, top, width, height = cv2.boundingRect(polygon)
	        if width >= min_width and height >= min_height:
	            found.append((left, top, width, height))
	    return found
	def is_close(box1, box2, threshold=10):
	    """Tell whether two boxes have nearly the same top-left corner.

	    Args:
	        box1: Box whose first two items are the corner's x and y.
	        box2: Second box, same layout.
	        threshold: Distance below which the corners count as close.

	    Returns:
	        ``True`` when the Euclidean distance between the two top-left
	        corners is strictly smaller than ``threshold``.
	    """
	    # Calculate the distance between the top-left corners of the two boxes
	    distance = ((box1[0] - box2[0]) ** 2 + (box1[1] - box2[1]) ** 2) ** 0.5
	    return distance < threshold
	def remove_close_boxes(boxes, threshold=10):
	    """Drop boxes whose top-left corner is close to an already kept box.

	    Boxes are visited in order; a box survives only if ``is_close`` is false
	    against every box kept so far, so the first box of each cluster wins.

	    Args:
	        boxes: Iterable of boxes accepted by ``is_close``.
	        threshold: Distance threshold forwarded to ``is_close``.

	    Returns:
	        A new list with the surviving boxes in their original order.
	    """
	    survivors = []
	    for candidate in boxes:
	        near_a_survivor = any(
	            is_close(candidate, kept, threshold) for kept in survivors
	        )
	        if not near_a_survivor:
	            survivors.append(candidate)
	    return survivors
	def is_contained(box1, box2):
	    """
	    Report whether box1 lies entirely inside box2 (edges may touch).
	    Both boxes use the (x, y, w, h) layout.
	    """
	    inner_x, inner_y, inner_w, inner_h = box1
	    outer_x, outer_y, outer_w, outer_h = box2

	    # The top-left corner of box1 must not lie before that of box2 ...
	    starts_inside = outer_x <= inner_x and outer_y <= inner_y
	    # ... and its bottom-right corner must not lie beyond that of box2.
	    ends_inside = (
	        outer_x + outer_w >= inner_x + inner_w
	        and outer_y + outer_h >= inner_y + inner_h
	    )
	    return starts_inside and ends_inside

	def remove_contained_boxes(boxes):
	    """
	    Remove boxes that are contained within other boxes.

	    Containment is decided by ``is_contained``. Two identical boxes contain
	    each other; in that case only the first occurrence is kept, so a
	    duplicated box is not lost altogether.

	    Args:
	        boxes: Sequence of (x, y, w, h) boxes.

	    Returns:
	        A new list with the boxes that no other box contains, in their
	        original order.
	    """
	    non_contained_boxes = []

	    for i, inner in enumerate(boxes):
	        swallowed = False
	        for j, outer in enumerate(boxes):
	            if i == j or not is_contained(inner, outer):
	                continue
	            # Mutual containment means the two boxes are identical: let the
	            # earlier one survive and drop only the later duplicates.
	            if i < j and is_contained(outer, inner):
	                continue
	            swallowed = True
	            break
	        if not swallowed:
	            non_contained_boxes.append(inner)

	    return non_contained_boxes
	def draw_colored_boxes_on_image_np(image, boxes_list,color_tuple):
	    """Outline every box on ``image`` in place with a 5-pixel border.

	    Args:
	        image: Array drawn on directly via ``cv2.rectangle``.
	        boxes_list: Iterable of (x, y, w, h) boxes.
	        color_tuple: Colour passed through to ``cv2.rectangle``.
	    """
	    for left, top, width, height in boxes_list:
	        top_left = (left, top)
	        bottom_right = (left + width, top + height)
	        cv2.rectangle(image, top_left, bottom_right, color_tuple, thickness=5)

	def is_filled_rectangle(image, rect, background_threshold=10, variance_threshold=0.1):
	    """Check whether the interior of ``rect`` consists only of zeros.

	    Args:
	        image: Array indexed as ``image[row, column]``.
	        rect: Box as (x, y, w, h); its one-pixel border is excluded from
	            the check.
	        background_threshold: Accepted for compatibility; not used by the
	            body.
	        variance_threshold: Accepted for compatibility; not used by the
	            body.

	    Returns:
	        Truthy when every interior value equals 0 (also when the interior
	        is empty).
	    """
	    left, top, width, height = rect
	    interior = image[top + 1:top + height - 1, left + 1:left + width - 1]
	    return (interior == 0).all()
	def get_below_box(image_np, x, y,width,step=15):
	    """Scan downwards from ``y`` for the first strip that is entirely 255.

	    Strips are ``step`` rows high and span columns ``x`` to ``x + width``.

	    Args:
	        image_np: Array whose first axis is the row index.
	        x: First column of the strip.
	        y: Row at which the scan starts.
	        width: Number of columns in the strip.
	        step: Strip height and scan increment.

	    Returns:
	        ``-1`` when ``y + step`` is not below the image height. Otherwise the
	        top row of the first strip whose values all equal 255, or, if no
	        such strip is found, the first row at which a further step would
	        reach the image height.
	    """
	    row_count = image_np.shape[0]
	    if y + step >= row_count:
	        return -1

	    cursor = y
	    while cursor + step < row_count:
	        strip = image_np[cursor:cursor + step, x:x + width]
	        if (strip == 255).all():
	            return cursor
	        cursor += step
	    return cursor
	def get_above_box(image_np, x, y,width,step=15):
	    """Scan upwards from ``y`` for the first strip that is entirely 255.

	    Strips are ``step`` rows high, end at the current row and span columns
	    ``x`` to ``x + width``.

	    Args:
	        image_np: Array whose first axis is the row index.
	        x: First column of the strip.
	        y: Row at which the scan starts (exclusive lower edge of the first
	            strip).
	        width: Number of columns in the strip.
	        step: Strip height and scan decrement.

	    Returns:
	        ``-1`` when ``y - step`` is not greater than 0. Otherwise the bottom
	        row of the first strip whose values all equal 255, or, if no such
	        strip is found, the first row from which a further step would
	        reach row 0 or above.
	    """
	    if y - step <= 0:
	        return -1

	    cursor = y
	    while cursor - step > 0:
	        strip = image_np[cursor - step:cursor, x:x + width]
	        if (strip == 255).all():
	            return cursor
	        cursor -= step
	    return cursor
	def is_note_rectangle(image_np, rect):
	    """OCR the interior of ``rect`` and report whether it starts with "note".

	    Args:
	        image_np: Page image. A 2-D (single-channel) array is passed to PIL
	            unchanged; any other array is converted with
	            ``cv2.COLOR_BGR2RGB`` first, as before.
	        rect: Box as (x, y, w, h); its one-pixel border is excluded.

	    Returns:
	        ``True`` when the stripped OCR text starts with "note", compared
	        case-insensitively.
	    """
	    x, y, w, h = rect
	    roi = image_np[y+1:y+h-1, x+1:x+w-1]
	    if roi.ndim == 2:
	        # BGR2RGB rejects single-channel input, yet the caller in this
	        # file treats the same page array as grayscale (GRAY2BGR).
	        roi_converted = Image.fromarray(roi)
	    else:
	        roi_converted = Image.fromarray(cv2.cvtColor(roi, cv2.COLOR_BGR2RGB))
	    text = pytesseract.image_to_string(roi_converted).strip()
	    note_str="note"
	    is_note = text.lower().startswith(note_str.lower())
	    print("is note text box="+str(is_note))
	    return is_note
	def extract_bounding_boxes_from_image_np(image_np, bounding_boxes_list, above_check_offset, above_caption_offset, color_tuple):
	    """Crop every detected box and classify it by the text found above it.

	    For each box whose interior is not already blank according to
	    ``is_filled_rectangle``:

	    * the crop is extended downwards to the row returned by
	      ``get_below_box`` unless that row is -1 or the box is a "note" box;
	    * the text between the row returned by ``get_above_box`` and the box is
	      OCR'd; a caption starting with "figure" or "table" (any case) sends
	      the crop to the figure or table list, anything else to the plain
	      lists;
	    * every consumed region is painted over in a working copy of the page.

	    Args:
	        image_np: Page image; it is never modified and all crops are slices
	            of it. Both 2-D (single-channel) and multi-channel arrays are
	            accepted for the caption OCR.
	        bounding_boxes_list: Iterable of (x, y, w, h) boxes.
	        above_check_offset: Not used by the body; kept for callers.
	        above_caption_offset: Not used by the body; kept for callers.
	        color_tuple: Colour used to paint over consumed regions.

	    Returns:
	        A 5-tuple ``(rect_content_list, above_rect_content_list,
	        figures_image_list, tables_image_list, image_np_copy)``. The first
	        two lists are index-aligned; an entry of the second is ``None`` or
	        ``(caption_text, box_crop)``. Figure and table entries are
	        ``(caption_text, box_crop)`` as well. The last item is the painted
	        copy of the page.
	    """
	    image_np_copy = image_np.copy()
	    rect_content_list = []
	    above_rect_content_list = []
	    figures_image_list = []
	    tables_image_list = []

	    for box in bounding_boxes_list:
	        x, y, w, h = box
	        if is_filled_rectangle(image_np_copy, box):
	            continue

	        # Decide how far down the crop reaches. The note check runs before
	        # the box is painted over in the working copy.
	        y_index = get_below_box(image_np, x, y+h, w)
	        if y_index == -1 or is_note_rectangle(image_np_copy, box):
	            bottom = y + h
	        else:
	            bottom = y_index
	        rect_content = image_np[y:bottom, x:x+w]
	        # One fill is enough: `bottom` is never above y + h, so the box
	        # itself is always covered.
	        cv2.rectangle(image_np_copy, (x, y), (x+w, bottom), color_tuple, cv2.FILLED)

	        above_box_y = get_above_box(image_np, x, y, w)
	        if above_box_y == -1 or above_box_y == y:
	            # No text region directly above the box.
	            above_rect_content_list.append(None)
	            rect_content_list.append(rect_content)
	            continue

	        above_rect_content = image_np[above_box_y:y, x:x+w]
	        if above_rect_content.ndim == 2:
	            # BGR2RGB rejects single-channel input, yet the caller in this
	            # file treats the same page array as grayscale (GRAY2BGR).
	            above_converted = Image.fromarray(above_rect_content)
	        else:
	            above_converted = Image.fromarray(cv2.cvtColor(above_rect_content, cv2.COLOR_BGR2RGB))
	        text = pytesseract.image_to_string(above_converted).strip()
	        caption = text.lower()
	        if caption.startswith("figure"):
	            print(text)
	            figures_image_list.append((text, rect_content))
	        elif caption.startswith("table"):
	            print(text)
	            tables_image_list.append((text, rect_content))
	        else:
	            # Appended together so the two plain lists stay index-aligned.
	            above_rect_content_list.append((text, rect_content))
	            rect_content_list.append(rect_content)

	        cv2.rectangle(image_np_copy, (x, above_box_y), (x+w, y), color_tuple, cv2.FILLED)

	    return rect_content_list,above_rect_content_list, figures_image_list, tables_image_list, image_np_copy
	def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
	    """Detect boxed regions on a page image and, in debug mode, display them.

	    Rectangles of at least 500x20 pixels are detected, near-duplicates and
	    nested rectangles are removed, and the remaining boxes are handed to
	    ``extract_bounding_boxes_from_image_np``.

	    Args:
	        image_index: Not referenced in the visible body.
	        gray_pdf_image_np: Page image; the debug branch converts it with
	            ``cv2.COLOR_GRAY2BGR``, so it is treated as single-channel.
	        debug: When true, extracted crops and captions are written to the
	            Streamlit page.

	    NOTE(review): no return statement is visible and ``cropped_image`` is
	    never read, so this function may be unfinished or cut off here.
	    """
	    bounding_boxes_list = extract_rectangle_from_image(gray_pdf_image_np, 500, 20)
	    bounding_boxes_list = remove_close_boxes (bounding_boxes_list, 10)
	    bounding_boxes_list = remove_contained_boxes(bounding_boxes_list)
	    if debug:
	        # The overlay is drawn on a colour copy; displaying it is currently
	        # commented out.
	        bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
	        color_tuple = (0, 255, 0)
	        draw_colored_boxes_on_image_np(bgr_image, bounding_boxes_list, color_tuple)
	        # st.image(Image.fromarray(bgr_image)) #to_be_displayed

	    # 30 and 50 fill the two offset parameters; (255, 255, 255) is the
	    # colour used to paint over extracted regions.
	    text_box_list, above_test_box_list,figures_image_list,tables_image_list, cropped_image = extract_bounding_boxes_from_image_np(gray_pdf_image_np,
	        bounding_boxes_list, 30,
	        50, (255, 255, 255))
	    # NOTE(review): the original indentation was lost; all three display
	    # loops below are assumed to belong to this debug branch - confirm.
	    if debug:
	        debug_text_box_index = 0
	        for text_box, above_text_box in zip(text_box_list, above_test_box_list):
	            print("text box start")
	            if above_text_box is not None:
	                # above_text_box is a (text, image) pair.
	                st.write(above_text_box[0])
	                st.image(Image.fromarray(above_text_box[1]))
	            # st.write(text)
	            st.image(Image.fromarray(text_box))
	            debug_text_box_index = debug_text_box_index + 1
	        for figure in figures_image_list:
	            st.write(figure[0])
	            st.image(Image.fromarray(figure[1]))
	        for table in tables_image_list:
	            st.write(table[0])
	            st.image(Image.fromarray(table[1]))