Instructions to use Kushalguptaiitb/table_test with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Kushalguptaiitb/table_test with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("object-detection", model="Kushalguptaiitb/table_test")

# Load model directly
from transformers import AutoImageProcessor, AutoModelForObjectDetection

processor = AutoImageProcessor.from_pretrained("Kushalguptaiitb/table_test")
model = AutoModelForObjectDetection.from_pretrained("Kushalguptaiitb/table_test")
```
- Notebooks
- Google Colab
- Kaggle
| import json | |
| import pandas as pd | |
def read_json(json_file):
    """Load and return the JSON document stored at ``json_file``.

    Args:
        json_file: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        Whatever object ``json.load`` decodes from the file.
    """
    with open(json_file, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
def adjust_page_dimensions_and_bbox(modified_model_output_json, pdfminer_json):
    """Rescale model blocks, in place, to the page size recorded by pdfminer.

    For every page key of ``modified_model_output_json`` that also exists in
    ``pdfminer_json``, the page width/height are read from the first entry of
    the pdfminer page list. Each block on that page then has its
    ``page_img_width`` / ``page_img_height`` overwritten with those values and
    its ``bbox`` multiplied by the matching width/height ratios.

    Pages whose pdfminer list is empty are reported with ``print`` and left
    untouched; pages missing from ``pdfminer_json`` are skipped silently.

    Args:
        modified_model_output_json: Mapping of page key -> list of block dicts
            carrying ``page_img_width``, ``page_img_height`` and ``bbox``.
        pdfminer_json: Mapping of page key -> list of dicts whose first entry
            carries ``page_width`` and ``page_height``.

    Returns:
        The same ``modified_model_output_json`` object, after mutation.
    """
    for page_number, blocks in modified_model_output_json.items():
        if page_number not in pdfminer_json:
            continue

        if not pdfminer_json[page_number]:
            print(f"Page {page_number} is empty.")
            continue

        page_info = pdfminer_json[page_number][0]
        page_width = page_info['page_width']
        page_height = page_info['page_height']

        for block in blocks:
            # Ratios are taken against the block's own recorded image size.
            width_scale = page_width / block['page_img_width']
            height_scale = page_height / block['page_img_height']
            block['page_img_width'] = page_width
            block['page_img_height'] = page_height

            box = block['bbox']
            scales = (width_scale, height_scale, width_scale, height_scale)
            block['bbox'] = [box[i] * scales[i] for i in range(4)]

    return modified_model_output_json
def convert_to_dataframe(extracted_df):
    """Coerce ``extracted_df`` into a ``pandas.DataFrame``.

    Conversion rules, checked in this order:

    * a ``DataFrame`` is returned unchanged (same object);
    * a ``dict`` whose values are all lists is passed to ``pd.DataFrame``
      directly; any other ``dict`` is wrapped in a one-element list so it
      becomes a single row;
    * a ``list`` whose items are all dicts is passed to ``pd.DataFrame``
      directly; any other ``list`` becomes a single ``'Value'`` column;
    * anything else becomes a one-row frame with a single ``'Value'`` column.

    Args:
        extracted_df: The object to convert.

    Returns:
        The resulting ``pandas.DataFrame``.
    """
    if isinstance(extracted_df, pd.DataFrame):
        return extracted_df

    if isinstance(extracted_df, dict):
        every_value_is_list = all(
            isinstance(column, list) for column in extracted_df.values()
        )
        return pd.DataFrame(extracted_df if every_value_is_list else [extracted_df])

    if isinstance(extracted_df, list):
        every_item_is_dict = all(isinstance(row, dict) for row in extracted_df)
        if every_item_is_dict:
            return pd.DataFrame(extracted_df)
        return pd.DataFrame(extracted_df, columns=['Value'])

    return pd.DataFrame([extracted_df], columns=['Value'])
def calculate_centroid(bbox):
    """Return the midpoint of a four-value bounding box.

    Args:
        bbox: Four values unpacked as ``(x1, y1, x2, y2)``.

    Returns:
        A ``(x_center, y_center)`` tuple: the averages of the two x values
        and of the two y values.
    """
    left, top, right, bottom = bbox
    return ((left + right) / 2, (top + bottom) / 2)
def is_within_radius(text_block_bbox, header_bbox, radius=50):
    """Report whether two bounding boxes overlap with non-zero area.

    Args:
        text_block_bbox: Four values unpacked as ``(xmin, ymin, xmax, ymax)``.
        header_bbox: Four values unpacked the same way.
        radius: Accepted for interface compatibility; the body never reads it.

    Returns:
        ``True`` when the boxes share a strictly positive extent on both axes
        (boxes that only touch along an edge do not count), else ``False``.
    """
    t_left, t_top, t_right, t_bottom = text_block_bbox
    h_left, h_top, h_right, h_bottom = header_bbox

    # Length of the shared span on each axis, clamped at zero when disjoint.
    shared_width = max(0, min(t_right, h_right) - max(t_left, h_left))
    shared_height = max(0, min(t_bottom, h_bottom) - max(t_top, h_top))

    return bool(shared_width > 0 and shared_height > 0)
def is_overlapped(text_block_bbox, header_bbox, threshold=0.20):
    """Report whether two boxes overlap by more than ``threshold``.

    The overlap ratio is the intersection area divided by the area of the
    smaller of the two boxes.

    Args:
        text_block_bbox: Four values unpacked as ``(xmin, ymin, xmax, ymax)``.
        header_bbox: Four values unpacked the same way.
        threshold: Ratio that must be strictly exceeded. Defaults to 0.20.

    Returns:
        ``True`` when the overlap ratio is greater than ``threshold``.
        ``False`` otherwise, including when the smaller box has zero area
        (a degenerate box cannot meaningfully overlap anything, and dividing
        by its area would raise ``ZeroDivisionError``).
    """
    text_xmin, text_ymin, text_xmax, text_ymax = text_block_bbox
    header_xmin, header_ymin, header_xmax, header_ymax = header_bbox

    # Shared extent on each axis, clamped at zero when the boxes are disjoint.
    overlap_x = max(0, min(text_xmax, header_xmax) - max(text_xmin, header_xmin))
    overlap_y = max(0, min(text_ymax, header_ymax) - max(text_ymin, header_ymin))
    overlap_area = overlap_x * overlap_y

    text_area = (text_xmax - text_xmin) * (text_ymax - text_ymin)
    header_area = (header_xmax - header_xmin) * (header_ymax - header_ymin)
    smaller_area = min(text_area, header_area)

    # Guard the division: a zero-width or zero-height box has no area.
    if smaller_area == 0:
        return False

    return overlap_area / smaller_area > threshold
def detect_header(text_block_bbox, adjusted_model_output_json, page_number, next_header_index_in_model_udop):
    """Check whether a text block overlaps a specific model block on a page.

    Args:
        text_block_bbox: Bounding box of the text block being tested.
        adjusted_model_output_json: Mapping of page key (string) -> list of
            model block dicts, each carrying a ``bbox``.
        page_number: Page to look up; converted with ``str`` before lookup.
        next_header_index_in_model_udop: Position of the candidate block in
            that page's list, or ``None`` when there is no candidate.

    Returns:
        ``True`` when the page exists, an index was supplied, and
        ``is_overlapped`` reports an overlap between ``text_block_bbox`` and
        the indexed block's ``bbox``; ``False`` otherwise.
    """
    page_key = str(page_number)
    if page_key not in adjusted_model_output_json:
        return False
    if next_header_index_in_model_udop is None:
        return False

    header_position = int(next_header_index_in_model_udop)
    header_block = adjusted_model_output_json[page_key][header_position]
    if is_overlapped(text_block_bbox, header_block['bbox']):
        return True
    return False
def remove_header_from_start(first_row_text: str, first_row_header_text: str) -> str:
    """Drop as many leading characters as the header is long, then strip.

    The header text is used only for its length; the function does not check
    that ``first_row_text`` actually begins with it.

    Args:
        first_row_text: Text to trim.
        first_row_header_text: Header whose length sets how much is removed.

    Returns:
        The remainder of ``first_row_text`` with surrounding whitespace removed.
    """
    prefix_length = len(first_row_header_text)
    remainder = first_row_text[prefix_length:]
    return remainder.strip()
def extract_last_header_index(all_blocks_with_indices):
    """Find the position of the last header block in a list of blocks.

    Args:
        all_blocks_with_indices: Sequence of block dicts, each with a
            ``label_name`` entry.

    Returns:
        The highest index whose ``label_name`` is ``'Page-header'`` or
        ``'Section-header'``, or ``-1`` when no such block exists.
    """
    header_labels = ['Page-header', 'Section-header']

    # Walk backwards so the first hit is the last header.
    position = len(all_blocks_with_indices) - 1
    while position >= 0:
        if all_blocks_with_indices[position]['label_name'] in header_labels:
            return position
        position -= 1
    return -1
def _build_header_records(current_header, header_block, content_parts,
                          table_content, tree_structure, source_pages):
    """Build the flat record and the tree-format record for one header."""
    content_text = " ".join(content_parts)
    flat_record = {
        "page_number": current_header["page_number"],
        "pdf_name": header_block['pdf_name'],
        "header": current_header["header_text"],
        "label_name": header_block['label_name'],
        "content": content_text,
        "table_content": table_content,
        "all_source_pages": source_pages,
    }
    tree_record = {
        "header_page_number": current_header["page_number"],
        "label_name": header_block['label_name'],
        'page_block_id': header_block['page_block_id'],
        "header_bbox": header_block['bbox'],
        "header_page_width": header_block['page_img_width'],
        "header_page_height": header_block['page_img_height'],
        "header": current_header["header_text"],
        "content": content_text,
        'tree_table_content': tree_structure,
    }
    return flat_record, tree_record


def match_headers_with_text(adjusted_model_json, pdfminer_json):
    """Pair each header block with the text and tables that follow it.

    Model blocks labelled ``Page-header``, ``Section-header``, ``Table`` or
    ``Portfolio-Company-Table`` are flattened in page order (page keys sorted
    by their integer value) and each is stamped with ``used_model_index``, its
    position in its page's block list. For every header with non-empty text:

    1. tables found before the next header are converted with
       ``convert_to_dataframe`` and collected;
    2. pdfminer text blocks are scanned from the header's page onwards. The
       block overlapping the header contributes its text minus the header's
       length; later blocks contribute their full text;
    3. when the next header's page is reached, a record is emitted as soon as
       a text block overlaps the next header, or at the last text block of
       that page;
    4. a header with no later header gets its record after the scan.

    A header whose next-header page yields no text blocks produces no record.

    NOTE(review): the nesting here was reconstructed from a listing whose
    indentation had been lost. Two choices need confirming against the
    original: the page scan runs once per header after all of its tables are
    collected, and the last-header step only runs for headers with text.

    Args:
        adjusted_model_json: Mapping of page key -> list of model block dicts.
            Blocks are mutated (``used_model_index`` is added).
        pdfminer_json: Mapping of page key (string) -> list of text block
            dicts with ``bbox``, ``text``, ``element_id`` and
            ``text_block_id``. The last key is taken as the last page.

    Returns:
        ``(matched_data, tree_format_matched_data)``: two parallel lists of
        record dicts, one flat and one tree-shaped, per emitted header.
    """
    header_labels = ('Page-header', 'Section-header')
    table_labels = ('Table', 'Portfolio-Company-Table')
    tracked_labels = header_labels + table_labels

    matched_data = []
    tree_format_matched_data = []
    current_header = None
    current_content = []
    current_header_table_content = []
    current_header_tree_structure = []

    # Flatten the tracked blocks in page order, remembering each block's
    # position within its own page so it can be looked up again later.
    all_blocks_with_indices = []
    for _, blocks in sorted(adjusted_model_json.items(), key=lambda item: int(item[0])):
        for index, block in enumerate(blocks):
            if block['label_name'] in tracked_labels:
                block['used_model_index'] = index
                all_blocks_with_indices.append(block)

    # With no pdfminer pages there is nothing to scan; -1 makes every page
    # range below empty instead of failing on an empty key list.
    last_pdf_page = int(list(pdfminer_json)[-1]) if pdfminer_json else -1

    for block_position, block in enumerate(all_blocks_with_indices):
        if block['label_name'] not in header_labels:
            continue

        next_header_detect_flag = False
        first_append_flag = False
        current_header_bbox = block['bbox']
        current_header_page_number = block['pdf_page_id']
        current_header_text = block['extracted_text'][0] if block['extracted_text'] else ""
        content_source_pages = []  # Pages that contributed text to this header.

        following_blocks = all_blocks_with_indices[block_position + 1:]

        # The first later header bounds this header's section; None means this
        # is the last header in the document.
        next_header_block = next(
            (later for later in following_blocks if later['label_name'] in header_labels),
            None,
        )
        if next_header_block is not None:
            next_header_index_in_model_udop = next_header_block['used_model_index']
            next_header_page_number = next_header_block["pdf_page_id"]
            next_header_text = (
                next_header_block['extracted_text'][0]
                if next_header_block['extracted_text'] else ""
            )
        else:
            next_header_index_in_model_udop = None
            next_header_page_number = None
            next_header_text = None

        if not current_header_text:
            continue

        if current_header is not None:
            current_content = []
            current_header_table_content = []
            current_header_tree_structure = []
        current_header = {
            "page_number": current_header_page_number,
            "header_text": current_header_text,
            "element_id": None,
            "text_block_id": None,
        }

        # Collect every table that sits between this header and the next one.
        for following_block in following_blocks:
            if following_block['label_name'] in header_labels:
                break
            if following_block['label_name'] in table_labels:
                table_header_info = following_block["associated_table_header_info"]
                if table_header_info is not None:
                    extracted_df_table_header = table_header_info['extracted_text'][0]
                else:
                    extracted_df_table_header = None
                extracted_df_new = convert_to_dataframe(following_block['extracted_text'][0])
                current_header_table_content.append({
                    'label_name': following_block['label_name'],
                    'table_header': extracted_df_table_header,
                    'table_column_header': extracted_df_new.columns.tolist(),
                    'table_info': extracted_df_new,
                    'metadata': {
                        'pdf_name': following_block['pdf_name'],
                        'table_page_id': following_block['pdf_page_id'],
                        'table_page_id_width': following_block['page_img_width'],
                        'table_page_id_height': following_block['page_img_height'],
                        'table_bbox': following_block['bbox'],
                    },
                })
                current_header_tree_structure.append(following_block)

        # Scan text from the header's page until the next header is reached.
        for pdf_page_num in range(int(current_header_page_number), last_pdf_page + 1):
            text_blocks = pdfminer_json.get(str(pdf_page_num), [])
            start_index = 0
            page_content_added = False

            # Locate the text block that holds the header itself (only once).
            if current_header["element_id"] is None and current_header["text_block_id"] is None:
                for index, text_block in enumerate(text_blocks):
                    if is_overlapped(text_block['bbox'], current_header_bbox):
                        current_header["element_id"] = text_block["element_id"]
                        current_header["text_block_id"] = text_block["text_block_id"]
                        start_index = index
                        first_append_flag = True
                        break

            for text_index, text_block in enumerate(text_blocks[start_index:], start=start_index):
                if first_append_flag:
                    # The header's own block: keep only what follows the header.
                    current_content.append(
                        remove_header_from_start(text_block['text'], current_header_text)
                    )
                    page_content_added = True
                    first_append_flag = False
                    continue

                if next_header_text is not None and pdf_page_num == int(next_header_page_number):
                    # Close the section at the next header, or at the last text
                    # block of its page when no overlap was found. The closing
                    # block's own text is not added to the content.
                    if (
                        detect_header(
                            text_block['bbox'],
                            adjusted_model_json,
                            next_header_page_number,
                            next_header_index_in_model_udop,
                        )
                        or text_index == len(text_blocks) - 1
                    ):
                        flat_record, tree_record = _build_header_records(
                            current_header, block, current_content,
                            current_header_table_content,
                            current_header_tree_structure,
                            content_source_pages,
                        )
                        matched_data.append(flat_record)
                        tree_format_matched_data.append(tree_record)
                        current_content = []
                        current_header_table_content = []
                        current_header_tree_structure = []
                        next_header_detect_flag = True
                        break

                current_content.append(text_block['text'])
                page_content_added = True

            if next_header_detect_flag:
                # NOTE(review): leaving here means the page on which the next
                # header was found is never added to content_source_pages,
                # even if it contributed text - confirm this is intended.
                break

            if page_content_added and pdf_page_num not in content_source_pages:
                content_source_pages.append(pdf_page_num)

        # Last header in the document: nothing closes the section, so emit the
        # record here.
        if next_header_text is None and next_header_page_number is None:
            # NOTE(review): resetting current_header makes the header block be
            # located again, so its leftover text is appended a second time -
            # confirm whether that duplication is intended.
            current_header = {
                "page_number": current_header_page_number,
                "header_text": current_header_text,
                "element_id": None,
                "text_block_id": None,
            }
            for pdf_page_num in range(int(current_header_page_number), last_pdf_page + 1):
                text_blocks = pdfminer_json.get(str(pdf_page_num), [])
                start_index = 0
                page_content_added = False

                if current_header["element_id"] is None and current_header["text_block_id"] is None:
                    for index, text_block in enumerate(text_blocks):
                        if is_overlapped(text_block['bbox'], current_header_bbox):
                            current_header["element_id"] = text_block["element_id"]
                            current_header["text_block_id"] = text_block["text_block_id"]
                            start_index = index
                            first_append_flag = True
                            break

                for text_block in text_blocks[start_index:]:
                    if not first_append_flag:
                        break
                    current_content.append(
                        remove_header_from_start(text_block['text'], current_header_text)
                    )
                    page_content_added = True
                    first_append_flag = False

                if page_content_added and pdf_page_num not in content_source_pages:
                    content_source_pages.append(pdf_page_num)

            flat_record, tree_record = _build_header_records(
                current_header, block, current_content,
                current_header_table_content,
                current_header_tree_structure,
                content_source_pages,
            )
            matched_data.append(flat_record)
            tree_format_matched_data.append(tree_record)

    return matched_data, tree_format_matched_data
def main_header_pipeline(modified_udop_json, pdfminer_json):
    """Run bbox rescaling and header matching, returning the results.

    Args:
        modified_udop_json: Model output passed first to
            ``adjust_page_dimensions_and_bbox`` and then to
            ``match_headers_with_text``.
        pdfminer_json: pdfminer output passed to both of those functions.

    Returns:
        A ``(df, tree_format_matched_data)`` tuple: a ``pandas.DataFrame``
        built from the flat matched records, and the tree-format record list.
    """
    rescaled_json = adjust_page_dimensions_and_bbox(modified_udop_json, pdfminer_json)
    flat_records, tree_records = match_headers_with_text(rescaled_json, pdfminer_json)
    return pd.DataFrame(flat_records), tree_records