table_test / post_processing_v2 (1).py

Upload post_processing_v2 (1).py

0538136 verified 7 months ago

22.6 kB

	import json
	import pandas as pd

	def read_json(json_file):
	    """Parse the UTF-8 encoded JSON file at ``json_file`` and return its content."""
	    with open(json_file, mode='r', encoding='utf-8') as handle:
	        parsed = json.load(handle)
	    return parsed

	def adjust_page_dimensions_and_bbox(modified_model_output_json, pdfminer_json):
	    """Rescale model blocks to the page size reported by pdfminer.

	    For every page key of ``modified_model_output_json`` that also exists in
	    ``pdfminer_json`` with a non-empty list, the page size is read from the
	    first pdfminer entry (``page_width`` / ``page_height``). Each model block
	    on that page then has its ``bbox`` multiplied by the width/height ratios
	    and its ``page_img_width`` / ``page_img_height`` overwritten.

	    The blocks are modified in place; the same mapping object is returned.
	    Pages present in ``pdfminer_json`` with an empty list only produce a
	    printed notice. Pages missing from ``pdfminer_json`` are left untouched.
	    """
	    for page_number, page_blocks in modified_model_output_json.items():
	        if page_number not in pdfminer_json:
	            continue

	        if not pdfminer_json[page_number]:
	            print(f"Page {page_number} is empty.")
	            continue

	        first_entry = pdfminer_json[page_number][0]
	        target_width = first_entry['page_width']
	        target_height = first_entry['page_height']

	        for entry in page_blocks:
	            # Ratios must be taken before the stored image size is replaced.
	            x_ratio = target_width / entry['page_img_width']
	            y_ratio = target_height / entry['page_img_height']
	            entry['page_img_width'] = target_width
	            entry['page_img_height'] = target_height

	            old_box = entry['bbox']
	            entry['bbox'] = [
	                old_box[0] * x_ratio,
	                old_box[1] * y_ratio,
	                old_box[2] * x_ratio,
	                old_box[3] * y_ratio,
	            ]

	    return modified_model_output_json

	def convert_to_dataframe(extracted_df):
	    """Coerce ``extracted_df`` into a ``pandas.DataFrame``.

	    * A DataFrame is returned unchanged (same object).
	    * A dict whose values are all lists becomes a column-wise frame; any
	      other dict becomes a single-row frame.
	    * A list whose items are all dicts becomes one row per dict; any other
	      list becomes a single ``Value`` column.
	    * Anything else becomes a one-cell frame under the ``Value`` column.
	    """
	    if isinstance(extracted_df, pd.DataFrame):
	        return extracted_df

	    if isinstance(extracted_df, dict):
	        columns_are_lists = all(
	            isinstance(column, list) for column in extracted_df.values()
	        )
	        # A dict of scalars has to be wrapped so it forms exactly one row.
	        payload = extracted_df if columns_are_lists else [extracted_df]
	        return pd.DataFrame(payload)

	    if isinstance(extracted_df, list):
	        rows_are_dicts = all(isinstance(row, dict) for row in extracted_df)
	        if rows_are_dicts:
	            return pd.DataFrame(extracted_df)
	        return pd.DataFrame(extracted_df, columns=['Value'])

	    return pd.DataFrame([extracted_df], columns=['Value'])

	def calculate_centroid(bbox):
	    """Return the ``(x, y)`` midpoint of a four-value ``(x1, y1, x2, y2)`` box."""
	    left, top, right, bottom = bbox
	    return ((left + right) / 2, (top + bottom) / 2)

	def is_within_radius(text_block_bbox, header_bbox, radius=50):
	    """Return True when the two boxes share a region of positive area.

	    Both boxes are ``(xmin, ymin, xmax, ymax)`` sequences. Boxes that merely
	    touch along an edge or corner do not count as overlapping.

	    ``radius`` is accepted for interface compatibility but does not take part
	    in the check: the result depends only on the two boxes.
	    """
	    t_left, t_top, t_right, t_bottom = text_block_bbox
	    h_left, h_top, h_right, h_bottom = header_bbox

	    # Extent of the intersection along each axis; non-positive means disjoint.
	    shared_width = min(t_right, h_right) - max(t_left, h_left)
	    shared_height = min(t_bottom, h_bottom) - max(t_top, h_top)

	    return bool(shared_width > 0 and shared_height > 0)

	def is_overlapped(text_block_bbox, header_bbox, threshold=0.20):
	    """Return True when the boxes overlap by more than ``threshold``.

	    The overlap ratio is the intersection area divided by the area of the
	    smaller of the two boxes, so a small box fully inside a large one scores
	    1.0.

	    Args:
	        text_block_bbox: ``(xmin, ymin, xmax, ymax)`` of the text block.
	        header_bbox: ``(xmin, ymin, xmax, ymax)`` of the header block.
	        threshold: Ratio that must be strictly exceeded.

	    Returns:
	        ``True`` if the ratio is greater than ``threshold``; ``False``
	        otherwise, including when either box has no positive area.
	    """
	    text_xmin, text_ymin, text_xmax, text_ymax = text_block_bbox
	    header_xmin, header_ymin, header_xmax, header_ymax = header_bbox

	    # Intersection extent per axis, clamped so disjoint boxes give zero.
	    overlap_x = max(0, min(text_xmax, header_xmax) - max(text_xmin, header_xmin))
	    overlap_y = max(0, min(text_ymax, header_ymax) - max(text_ymin, header_ymin))
	    overlap_area = overlap_x * overlap_y

	    text_area = (text_xmax - text_xmin) * (text_ymax - text_ymin)
	    header_area = (header_xmax - header_xmin) * (header_ymax - header_ymin)
	    smaller_area = min(text_area, header_area)

	    # A degenerate (zero-width/height or inverted) box cannot overlap
	    # anything; bail out instead of dividing by zero.
	    if smaller_area <= 0:
	        return False

	    return bool(overlap_area / smaller_area > threshold)

	def detect_header(text_block_bbox, adjusted_model_output_json, page_number ,next_header_index_in_model_udop):
	    """Tell whether a text block overlaps the expected next header block.

	    Args:
	        text_block_bbox: ``(xmin, ymin, xmax, ymax)`` of the text block.
	        adjusted_model_output_json: Mapping of page key (string) to the list
	            of model blocks on that page; each block exposes ``'bbox'``.
	        page_number: Page of the next header; looked up as ``str(page_number)``.
	        next_header_index_in_model_udop: Position of the next header inside
	            that page's block list, or ``None`` when there is no next header.

	    Returns:
	        ``True`` if the page exists, an index was given and ``is_overlapped``
	        reports an overlap with that header's bbox; ``False`` otherwise.
	    """
	    page_key = str(page_number)
	    if page_key not in adjusted_model_output_json:
	        return False
	    if next_header_index_in_model_udop is None:
	        return False

	    header_position = int(next_header_index_in_model_udop)
	    header_block = adjusted_model_output_json[page_key][header_position]
	    return bool(is_overlapped(text_block_bbox, header_block['bbox']))

	def remove_header_from_start(first_row_text: str, first_row_header_text: str) -> str:
	    """Drop as many leading characters as the header is long, then strip.

	    The cut is purely positional: ``len(first_row_header_text)`` characters
	    are removed from the front of ``first_row_text`` without checking that
	    they actually equal the header. Surrounding whitespace of the remainder
	    is trimmed.
	    """
	    cut_at = len(first_row_header_text)
	    remainder = first_row_text[cut_at:]
	    return remainder.strip()

	def extract_last_header_index(all_blocks_with_indices):
	    """Return the position of the last header block, or ``-1`` if none exists.

	    A header block is one whose ``'label_name'`` is ``'Page-header'`` or
	    ``'Section-header'``. The sequence is scanned from its end.
	    """
	    header_labels = ['Page-header', 'Section-header']

	    position = len(all_blocks_with_indices) - 1
	    while position >= 0:
	        if all_blocks_with_indices[position]['label_name'] in header_labels:
	            return position
	        position -= 1

	    return -1

	def match_headers_with_text(adjusted_model_json, pdfminer_json):
	# Pair every Page-header / Section-header block of the model output with
	# the pdfminer text and the table blocks gathered for it.
	#
	# Parameters:
	#   adjusted_model_json: mapping of page key -> list of block dicts. Pages
	#       are ordered with int(key). Blocks are read through 'label_name',
	#       'bbox', 'pdf_page_id', 'extracted_text', 'page_img_width',
	#       'page_img_height', 'page_block_id', 'pdf_name' and, for table
	#       blocks, 'associated_table_header_info'. Every selected block also
	#       gets a 'used_model_index' key written into it (input is mutated).
	#   pdfminer_json: mapping looked up with str(page number) -> list of
	#       text-block dicts read through 'bbox', 'text', 'element_id' and
	#       'text_block_id'.
	#
	# Returns:
	#   (matched_data, tree_format_matched_data): two lists of dicts filled in
	#   parallel. The first carries 'content', 'table_content' and
	#   'all_source_pages'; the second carries the header geometry and
	#   'tree_table_content'.
	#
	# NOTE(review): this copy of the file has lost its nested indentation, so
	# the exact nesting of the branches below cannot be confirmed from here;
	# the comments describe each statement group on its own.
	matched_data = []
	tree_format_matched_data = []
	current_header = None
	current_content = []
	current_header_table_content = []
	current_header_tree_structure = []
	# Visit pages in numeric order; every page key must be accepted by int().
	sorted_pages = sorted(adjusted_model_json.items(), key=lambda x: int(x[0]))

	# Flatten headers and tables into one page-ordered list, remembering each
	# block's position inside its own page as 'used_model_index'.
	all_blocks_with_indices = []
	for key, blocks in sorted_pages:
	for index, block in enumerate(blocks):
	if block['label_name'] in ['Page-header','Section-header','Table', "Portfolio-Company-Table"]:
	block['used_model_index'] = index
	all_blocks_with_indices.append(block)


	# NOTE(review): the loop variable `id` shadows the builtin of the same name.
	for id,block in enumerate(all_blocks_with_indices):
	if block['label_name'] in ['Page-header','Section-header']:
	next_header_detect_flag = False
	# NOTE(review): current_header_index_in_model and current_header_centroid
	# are assigned here but never read again in this function.
	current_header_index_in_model = block['used_model_index']
	current_header_bbox = block['bbox']
	current_header_type = block['label_name']
	current_header_centroid = calculate_centroid(block['bbox'])
	current_header_page_number = block['pdf_page_id']
	# Only the first element of 'extracted_text' is used as the header text;
	# an empty/falsy value yields "".
	current_header_text = block['extracted_text'][0] if block['extracted_text'] else ""
	current_header_page_width = block['page_img_width']
	current_header_page_height = block['page_img_height']
	current_header_page_block_id = block['page_block_id']
	current_header_pdf_name = block['pdf_name']
	content_source_pages = [] # Track pages where content is collected
	# Look ahead for the next header block and remember where it lives.
	new_start_index = id + 1
	if new_start_index < len(all_blocks_with_indices):
	for next_id ,next_block in enumerate(all_blocks_with_indices[new_start_index:], start = new_start_index):
	if next_block['label_name'] in ['Page-header', 'Section-header']:
	next_header_index_in_model_udop = next_block['used_model_index']
	# NOTE(review): next_header_bbox and next_header_centroid are only ever
	# assigned; nothing below reads them.
	next_header_bbox = next_block['bbox']
	next_header_centroid = calculate_centroid(next_block['bbox'])
	next_header_page_number = next_block["pdf_page_id"]
	next_header_text = next_block['extracted_text'][0] if next_block['extracted_text'] else ""
	break

	# Fallback that clears all next-header fields when no next header applies.
	else:
	next_header_bbox = None
	next_header_centroid = None
	next_header_page_number = None
	next_header_index_in_model_udop = None
	next_header_text = None

	# The last header of the document (or the very last block) has no successor.
	# NOTE(review): all_blocks_with_indices is not modified inside this loop, so
	# this value could be computed once before the loop.
	last_header_index = extract_last_header_index(all_blocks_with_indices)
	if id == len(all_blocks_with_indices) - 1 or id == last_header_index:
	next_header_bbox = None
	next_header_centroid = None
	next_header_page_number = None
	next_header_index_in_model_udop = None
	next_header_text = None

	# Headers whose text is empty are not handled by the branch below.
	if current_header_text:
	# Once an earlier header has been seen, start with fresh accumulators.
	if current_header is not None:
	current_content = []
	current_header_table_content = []
	current_header_tree_structure = []

	# element_id / text_block_id stay None until the header's own pdfminer
	# text block has been located.
	current_header = {
	"page_number": current_header_page_number,
	"header_text": current_header_text,
	"element_id": None,
	"text_block_id": None
	}
	# Collect the table blocks that follow this header, stopping at the next
	# header block.
	new_start_index = id + 1
	for new_id,new_block in enumerate(all_blocks_with_indices[new_start_index:], start = new_start_index):
	# NOTE(review): extracted_df_flag is written several times but never read.
	extracted_df_flag = False
	next_block = new_block
	if next_block and next_block['label_name'] in ['Page-header', 'Section-header']:
	extracted_df_flag = False
	break

	# if next_block and next_block['label_name'] in ['Table']:
	if next_block and next_block['label_name'] in ['Table', "Portfolio-Company-Table"]:
	extracted_df_flag = True
	# The first element of 'extracted_text' is the table payload handed to
	# convert_to_dataframe below.
	extracted_df = next_block['extracted_text'][0]
	if next_block["associated_table_header_info"] is not None:
	extracted_df_table_header = next_block["associated_table_header_info"]['extracted_text'][0]
	else:
	extracted_df_table_header = None

	extracted_df_new = convert_to_dataframe(extracted_df)
	extracted_df_new_column_headers = extracted_df_new.columns.tolist()
	# NOTE(review): despite its name this holds CSV text (to_csv), and it is
	# not used afterwards.
	extracted_df_markdown = extracted_df_new.to_csv(index=False)

	table_metadata = { 'pdf_name': next_block['pdf_name'] ,
	'table_page_id': next_block['pdf_page_id'],
	'table_page_id_width' : next_block['page_img_width'],
	'table_page_id_height': next_block['page_img_height'],
	'table_bbox' : next_block['bbox']
	}

	# 'table_info' stores the DataFrame object itself, not a serialised form.
	table_header_pair = {
	# 'label_name':'Table-header',
	'label_name':next_block['label_name'],
	'table_header': extracted_df_table_header,
	'table_column_header' : extracted_df_new_column_headers,
	'table_info': extracted_df_new,
	'metadata' : table_metadata
	}

	# NOTE(review): tree_table_header_info is built but never appended or
	# returned; the raw block is appended to the tree structure instead.
	tree_table_header_info = {
	'label_name':'Table-header',
	# 'label_name':next_block['label_name'],
	'table_header_info': next_block["associated_table_header_info"],
	'table_column_header' : extracted_df_new_column_headers,
	'table_info': next_block
	}

	# current_header_table_content.append(extracted_df)
	current_header_table_content.append(table_header_pair)
	current_header_tree_structure.append(next_block)

	# Last page = last key of pdfminer_json in its iteration order, via int().
	# NOTE(review): assumes the keys were inserted in ascending page order — confirm.
	last_pdf_page = int(list(pdfminer_json.keys())[-1])
	first_append_flag = False
	first_append_text = " "
	# Scan pdfminer pages from the header's page up to the last page; pages
	# missing from pdfminer_json are treated as having no text blocks.
	for pdf_page_num in range(int(current_header_page_number), last_pdf_page + 1):
	text_blocks = pdfminer_json.get(str(pdf_page_num), [])
	start_index = 0
	page_content_added = False # Track if content was added from this page
	# Until the header's own text block is known, look on this page for the
	# first text block that overlaps the header bbox and start from there.
	if current_header["element_id"] is None and current_header["text_block_id"] is None:
	for index, text_block in enumerate(text_blocks):
	if is_overlapped(text_block['bbox'],current_header_bbox):
	current_header["element_id"] = text_block["element_id"]
	current_header["text_block_id"] = text_block["text_block_id"]
	start_index = index
	first_append_flag = True
	break

	for next_header_index, text_block in enumerate(text_blocks[start_index:], start = start_index):
	# NOTE(review): last_text_reached_flag is never read.
	last_text_reached_flag = False
	# First block after the header was located: drop len(header text) leading
	# characters from it and keep the remainder as content.
	if first_append_flag:
	first_row_text = text_block['text']
	first_row_header_text = current_header_text
	first_append_text = remove_header_from_start(first_row_text,first_row_header_text)
	current_content.append(first_append_text)
	page_content_added = True
	first_append_flag = False
	continue

	# Only on the page of the next header are text blocks tested against it.
	if next_header_text is not None and pdf_page_num == int(next_header_page_number):
	next_header_found_flag = False

	# The text block overlaps the next header: emit the record accumulated for
	# the current header and stop scanning.
	if detect_header(text_block['bbox'], adjusted_model_json, next_header_page_number,next_header_index_in_model_udop):
	next_header_found_flag = True
	matched_data.append({
	"page_number": current_header["page_number"],
	"pdf_name" : current_header_pdf_name ,
	"header": current_header["header_text"],
	"label_name": current_header_type,
	"content": " ".join(current_content),
	"table_content" : current_header_table_content,
	"all_source_pages": content_source_pages
	})
	tree_format_matched_data.append({
	"header_page_number": current_header["page_number"],
	"label_name":current_header_type,
	'page_block_id' : current_header_page_block_id,
	"header_bbox": current_header_bbox,
	"header_page_width":current_header_page_width,
	"header_page_height": current_header_page_height,
	"header": current_header["header_text"],
	"content": " ".join(current_content),
	'tree_table_content' : current_header_tree_structure
	})
	current_content = []
	# NOTE(review): `current_table_content` is not read anywhere in this
	# function; the matching reset further down uses
	# `current_header_table_content`, so this looks like a typo.
	current_table_content = []
	current_header_tree_structure = []
	next_header_detect_flag = True
	break

	# Last text block of the page reached without matching the next header:
	# emit what has been gathered so far.
	if next_header_index == len(text_blocks) - 1:
	last_text_block = text_block
	if not next_header_found_flag and last_text_block:
	matched_data.append({
	"page_number": current_header["page_number"],
	"pdf_name" : current_header_pdf_name ,
	"header": current_header["header_text"],
	"label_name": current_header_type,
	"content": " ".join(current_content),
	"table_content" : current_header_table_content,
	"all_source_pages": content_source_pages
	})
	tree_format_matched_data.append({
	"header_page_number": current_header["page_number"],
	# NOTE(review): `currentHeaderType` is not defined in this function; every
	# other record uses `current_header_type`. Unless the name exists at module
	# level elsewhere, reaching this line raises NameError.
	"label_name":currentHeaderType,
	'page_block_id' : current_header_page_block_id,
	"header_bbox": current_header_bbox,
	"header_page_width":current_header_page_width,
	"header_page_height": current_header_page_height,
	"header": current_header["header_text"],
	"content": " ".join(current_content),
	'tree_table_content' : current_header_tree_structure
	})
	current_content = []
	current_header_table_content = []
	current_header_tree_structure = []
	next_header_detect_flag = True
	next_header_found_flag = True
	break

	# Ordinary text block: add its text to the running content.
	current_content.append(text_block['text'])
	page_content_added = True
	if next_header_detect_flag:
	break

	# Add page number to source pages if content was added from this page
	if page_content_added and pdf_page_num not in content_source_pages:
	content_source_pages.append(pdf_page_num)

	if next_header_detect_flag:
	break

	# No next header exists: handle the final header of the document.
	if next_header_text is None and next_header_page_number is None:
	# Rebuilding the dict resets element_id / text_block_id to None, so the
	# header search below runs again.
	current_header = {
	"page_number": current_header_page_number,
	"header_text": current_header_text,
	"element_id": None,
	"text_block_id": None
	}

	# NOTE(review): last_pdf_page and first_append_flag are only assigned in the
	# non-empty-header branch above; confirm they are always bound here.
	for pdf_page_num in range(int(current_header_page_number), last_pdf_page + 1):
	text_blocks = pdfminer_json.get(str(pdf_page_num), [])
	start_index = 0
	page_content_added = False # Track if content was added from this page
	if current_header["element_id"] is None and current_header["text_block_id"] is None:
	for index, text_block in enumerate(text_blocks):
	if is_overlapped(text_block['bbox'],current_header_bbox):
	current_header["element_id"] = text_block["element_id"]
	current_header["text_block_id"] = text_block["text_block_id"]
	start_index = index
	first_append_flag = True
	break

	# NOTE(review): this loop only appends the header-stripped first block;
	# the remaining text blocks are iterated without being added here.
	for no_header_index, text_block in enumerate(text_blocks[start_index:], start=start_index):
	if first_append_flag:
	first_row_text = text_block['text']
	first_row_header_text = current_header_text
	first_append_text = remove_header_from_start(first_row_text,first_row_header_text)
	current_content.append(first_append_text)
	page_content_added = True
	first_append_flag = False
	continue

	# Add page number to source pages if content was added from this page
	if page_content_added and pdf_page_num not in content_source_pages:
	content_source_pages.append(pdf_page_num)

	# Emit the record for the final header.
	matched_data.append({
	"page_number": current_header["page_number"],
	"pdf_name" : current_header_pdf_name ,
	"header": current_header["header_text"],
	"label_name": current_header_type,
	"content": " ".join(current_content),
	"table_content" : current_header_table_content,
	"all_source_pages": content_source_pages
	})
	tree_format_matched_data.append({
	"header_page_number": current_header["page_number"],
	"label_name": current_header_type,
	'page_block_id' : current_header_page_block_id,
	"header_bbox": current_header_bbox,
	"header_page_width":current_header_page_width,
	"header_page_height": current_header_page_height,
	"header": current_header["header_text"],
	"content": " ".join(current_content),
	'tree_table_content' : current_header_tree_structure
	})

	return matched_data,tree_format_matched_data

	def main_header_pipeline(modified_udop_json, pdfminer_json):
	    """Run the header post-processing end to end.

	    The model blocks are first rescaled with
	    ``adjust_page_dimensions_and_bbox`` and then matched against the pdfminer
	    text with ``match_headers_with_text``.

	    Returns:
	        A tuple ``(df, tree_format_matched_data)`` where ``df`` is a
	        ``pandas.DataFrame`` built from the flat match records and the second
	        item is the tree-style record list, passed through unchanged.
	    """
	    rescaled_blocks = adjust_page_dimensions_and_bbox(modified_udop_json, pdfminer_json)
	    flat_records, tree_records = match_headers_with_text(rescaled_blocks, pdfminer_json)
	    return pd.DataFrame(flat_records), tree_records