import time

import boto3

from utility import terminal_print, create_md_table
from application import aws_access_key_id, aws_secret_access_key, default_s3_bucket

textract = boto3.client(
    'textract',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name='us-east-1')


def _collect_pages(first_page, fetch_page):
    '''
    Merge the "Blocks" of every remaining result page into a copy of the first page.
    Parameters:
        first_page: the first response of a job; must contain "JobId"
        fetch_page: callable accepting JobId and NextToken keyword arguments and returning the next page
    Returns:
        result: a copy of first_page whose "Blocks" list holds the blocks of all pages
    '''
    job_id = first_page["JobId"]
    result = first_page.copy()
    # dict.copy() is shallow: give the result its own list so that extending it
    # below does not modify the "Blocks" list of the caller's first_page.
    if "Blocks" in result:
        result["Blocks"] = list(result["Blocks"])
    page = result
    while "NextToken" in page:
        page = fetch_page(JobId=job_id, NextToken=page["NextToken"])
        result["Blocks"].extend(page["Blocks"])
    # NOTE: every other key (including the first page's "NextToken", if any)
    # is returned exactly as it was in first_page.
    return result


def _poll_until_done(fetch_result, job_id, delay):
    '''
    Poll a job until its "JobStatus" is no longer "IN_PROGRESS".
    Parameters:
        fetch_result: callable accepting a JobId keyword argument and returning the job response
        job_id: the id of the job to poll
        delay: seconds to sleep before every poll (including the first one)
    Returns:
        response: the last response received, with "JobId" added to it
    '''
    response = {"JobStatus": "IN_PROGRESS"}
    while response["JobStatus"] == "IN_PROGRESS":
        time.sleep(delay)
        response = fetch_result(JobId=job_id)
    # NOTE(review): the loop ends on ANY status other than "IN_PROGRESS", so the
    # response is returned as is even when the job did not succeed; callers that
    # need to distinguish should inspect response["JobStatus"].
    response["JobId"] = job_id
    return response


@terminal_print
def textract_get_tables(res_tables,textract=textract):
    '''
    This function is used to get the tables from the textract output
    Parameters:
        res_tables: the output from the textract.get_document_analysis function (with "JobId" added)
        textract: the boto3 client for textract
    Returns:
        result: the cascaded output with blocks from the textract.get_document_analysis function;
                res_tables itself (including its "Blocks" list) is left unmodified
    '''
    return _collect_pages(res_tables, textract.get_document_analysis)


@terminal_print
def textract_get_text(res_text,textract=textract):
    '''
    This function is used to get the text from the textract output
    Parameters:
        res_text: the output from the textract.get_document_text_detection function (with "JobId" added)
        textract: the boto3 client for textract
    Returns:
        result: the cascaded output with blocks from the textract.get_document_text_detection function;
                res_text itself (including its "Blocks" list) is left unmodified
    '''
    return _collect_pages(res_text, textract.get_document_text_detection)


@terminal_print
def get_article_tables(file_name:str,bucket:str,delay:int=5):
    '''
    This function is used to start a table analysis job and wait for its first result page
    Parameters:
        file_name: the name of the file in the bucket
        bucket: the name of the bucket
        delay: the delay time (seconds) between calls to the textract.get_document_analysis function
    Returns:
        res_tables: the output from the textract.get_document_analysis function with initial blocks
                    and the "JobId" of the job added
    '''
    # need to use async method to process the files
    job_tables = textract.start_document_analysis(
        DocumentLocation={
            "S3Object":{
                "Bucket":bucket,
                "Name": file_name
            }
        },
        FeatureTypes=["TABLES"]
    )
    return _poll_until_done(textract.get_document_analysis, job_tables["JobId"], delay)


@terminal_print
def get_article_text(file_name:str,bucket:str,delay:int=5):
    '''
    This function is used to start a text detection job and wait for its first result page
    Parameters:
        file_name: the name of the file in the bucket
        bucket: the name of the bucket
        delay: the delay time (seconds) between calls to the textract.get_document_text_detection function
    Returns:
        res_text: the output from the textract.get_document_text_detection function with initial blocks
                  and the "JobId" of the job added
    '''
    job_text = textract.start_document_text_detection(
        DocumentLocation={
            "S3Object":{
                "Bucket":bucket,
                "Name": file_name
            }
        }
    )
    return _poll_until_done(textract.get_document_text_detection, job_text["JobId"], delay)


@terminal_print
def construct_tables(tables):
    '''
    This function is used to construct the tables from the textract output
    Parameters:
        tables: the output from the textract.get_document_analysis function
    Returns:
        table_blocks: the list of tables; each entry holds the table's "id", "relationship",
                      "confidence", "page", a "map" of related block ids, and one list of
                      related blocks per block type (keyed by the block type, e.g. "CELL")
        blocks_dict: the dictionary of blocks with the block id as the key
    '''
    blocks_dict = {}
    table_blocks = []
    for block in tables["Blocks"]:
        blocks_dict[block["Id"]] = block
        if block["BlockType"] == "TABLE":
            table_blocks.append({
                "id": block["Id"],
                # A TABLE block without related blocks may carry no
                # "Relationships" entry; treat it as an empty table.
                "relationship": block.get("Relationships", []),
                "confidence": block["Confidence"],
                "page": block.get("Page"),
                "map": {},
            })

    # Related blocks are resolved only after the loop above, once blocks_dict
    # contains every block of the document.
    for table in table_blocks:
        for relation in table["relationship"]:
            table["map"].update(
                {block_id: {"Type": relation["Type"]} for block_id in relation["Ids"]}
            )
        for block_id in table["map"]:
            component = blocks_dict[block_id]
            table.setdefault(component["BlockType"], []).append(component)
    return table_blocks, blocks_dict


# Transfer the table blocks from aws textract into a table
@terminal_print
def textract_output_to_table(table,blocks_dict):
    '''
    This function is used to transfer the table blocks from aws textract into a table
    Parameters:
        table: the table block from the textract output (an entry of construct_tables' table_blocks)
        blocks_dict: the dictionary of blocks with the block id as the key
    Returns:
        array: the table array (list of rows, each a list of cell strings) with the text
               from the table blocks; [[]] when the table has no cells
    '''
    # Order the cells by grid position so that the row grouping below does not
    # depend on the order in which the cells were listed. The sort is stable,
    # so input that is already in row-major order is left untouched.
    cells = sorted(
        table.get("CELL", []),
        key=lambda cell: (cell["RowIndex"], cell.get("ColumnIndex", 0)),
    )

    array = [[]]
    cur_row = 1
    for cell in cells:
        row_index = cell["RowIndex"]
        if row_index > cur_row:
            array.append([])
            cur_row = row_index
        relationships = cell.get("Relationships")
        if relationships:
            # Only the WORD blocks referenced by the first relationship
            # contribute text to the cell.
            words = [
                blocks_dict[block_id]["Text"]
                for block_id in relationships[0]["Ids"]
                if blocks_dict[block_id]["BlockType"] == "WORD"
            ]
        else:
            words = [""]
        array[-1].append(" ".join(words))
    return array


@terminal_print
def get_tables(filename:str,bucket:str=default_s3_bucket):
    '''
    This function is used to get the tables from the textract output
    Parameters:
        filename: the name of the file in the bucket
        bucket: the name of the bucket
    Returns:
        md_tables: the list of tables in markdown format
    '''
    first_page = get_article_tables(file_name=filename,bucket=bucket)
    tables = textract_get_tables(first_page)
    table_blocks, blocks_dict = construct_tables(tables)
    return [
        create_md_table(textract_output_to_table(table, blocks_dict))
        for table in table_blocks
    ]