import boto3 

from utility import terminal_print, create_md_table
from application import aws_access_key_id, aws_secret_access_key, default_s3_bucket

# Module-level Textract client. It is the default value of the `textract`
# parameter of textract_get_tables / textract_get_text and is used directly by
# get_article_tables / get_article_text. Credentials are the values imported
# from the `application` module; the region is fixed to us-east-1.
textract = boto3.client(
    'textract',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name='us-east-1')

@terminal_print
def textract_get_tables(res_tables,textract=textract):
    '''
    Collect the blocks of every result page of a document-analysis job into one response

    Starting from the first response, follows "NextToken" with
    textract.get_document_analysis until a page without a token is reached.

    Parameters:
    res_tables: the first response from textract.get_document_analysis; it must
        contain "JobId" (and "NextToken" when more pages exist). It is not modified.
    textract: the boto3 client for textract

    Returns:
    result: a copy of res_tables whose "Blocks" list holds the blocks of all pages.
        All other keys (including any "NextToken") are carried over from res_tables unchanged.
    '''
    job_id = res_tables["JobId"]

    result = res_tables.copy()
    # dict.copy() is shallow: without its own list, extending result["Blocks"]
    # would also grow the caller's res_tables["Blocks"].
    if "Blocks" in result:
        result["Blocks"] = list(result["Blocks"])

    page = res_tables
    while "NextToken" in page:
        page = textract.get_document_analysis(JobId=job_id,NextToken=page["NextToken"])
        result["Blocks"].extend(page["Blocks"])

    return result

@terminal_print
def textract_get_text(res_text,textract=textract):
    '''
    Collect the blocks of every result page of a text-detection job into one response

    Starting from the first response, follows "NextToken" with
    textract.get_document_text_detection until a page without a token is reached.

    Parameters:
    res_text: the first response from textract.get_document_text_detection; it must
        contain "JobId" (and "NextToken" when more pages exist). It is not modified.
    textract: the boto3 client for textract

    Returns:
    result: a copy of res_text whose "Blocks" list holds the blocks of all pages.
        All other keys (including any "NextToken") are carried over from res_text unchanged.
    '''
    job_id = res_text["JobId"]

    result = res_text.copy()
    # dict.copy() is shallow: without its own list, extending result["Blocks"]
    # would also grow the caller's res_text["Blocks"].
    if "Blocks" in result:
        result["Blocks"] = list(result["Blocks"])

    page = res_text
    while "NextToken" in page:
        page = textract.get_document_text_detection(JobId=job_id,NextToken=page["NextToken"])
        result["Blocks"].extend(page["Blocks"])

    return result

@terminal_print
def get_article_tables(file_name:str,bucket:str,delay:int=5):
    '''
    Start a Textract table-analysis job for a file in S3 and wait until it stops running

    Parameters:
    file_name: the name of the file in the bucket
    bucket: the name of the bucket
    delay: the number of seconds to sleep before each status poll

    Returns:
    response: the first textract.get_document_analysis response whose "JobStatus"
        is not "IN_PROGRESS", with the job id added under "JobId". Further result
        pages ("NextToken") are not fetched here.
    '''
    import time

    # The asynchronous API is used: starting the job only returns a job id.
    document = {"S3Object": {"Bucket": bucket, "Name": file_name}}
    started = textract.start_document_analysis(
        DocumentLocation=document,
        FeatureTypes=["TABLES"],
    )
    job_id = started["JobId"]

    # Sleep first, then poll; stop on any status other than IN_PROGRESS.
    while True:
        time.sleep(delay)
        response = textract.get_document_analysis(JobId=job_id)
        if response["JobStatus"] != "IN_PROGRESS":
            break

    response["JobId"] = job_id
    return response

@terminal_print
def get_article_text(file_name:str,bucket:str,delay:int=5):
    '''
    Start a Textract text-detection job for a file in S3 and wait until it stops running

    Parameters:
    file_name: the name of the file in the bucket
    bucket: the name of the bucket
    delay: the number of seconds to sleep before each status poll

    Returns:
    res_text: the first textract.get_document_text_detection response whose
        "JobStatus" is not "IN_PROGRESS", with the job id added under "JobId".
        The loop does not distinguish success from failure, and further result
        pages ("NextToken") are not fetched here.
    '''
    import time
    job_text = textract.start_document_text_detection(
        DocumentLocation={
            "S3Object":{
                "Bucket":bucket,
                "Name": file_name
                }
            }
        )

    text_job_id = job_text["JobId"]
    # Placeholder status so the loop body runs at least once.
    res_text = {"JobStatus":"IN_PROGRESS"}

    # The loop condition already guarantees the job is still in progress here,
    # so no second status check is needed before polling.
    while res_text["JobStatus"] == "IN_PROGRESS":
        time.sleep(delay)
        res_text = textract.get_document_text_detection(JobId=text_job_id)

    res_text["JobId"] = text_job_id

    return res_text

@terminal_print
def construct_tables(tables):
    '''
    Index all blocks by id and group, for every TABLE block, its related blocks by block type

    Parameters:
    tables: a response dict with a "Blocks" list (e.g. the output of
        textract.get_document_analysis)

    Returns:
    table_blocks: one dict per TABLE block with the keys "id", "relationship",
        "confidence", "page" and "map" (related block id -> {"Type": relationship type}),
        plus one key per block type of the related blocks (e.g. "CELL") holding
        the list of those blocks
    blocks_dict: the dictionary of all blocks with the block id as the key
    '''
    blocks = tables["Blocks"]

    blocks_dict = {}
    table_blocks = []

    for block in blocks:

        blocks_dict[block["Id"]] = block

        if block["BlockType"] == "TABLE":
            table_blocks.append({
                "id":block["Id"],
                # A block without related blocks may lack the key; treat that
                # as "no relationships" instead of raising KeyError.
                "relationship":block.get("Relationships", []),
                "confidence":block["Confidence"],
                "page":block["Page"],
                "map":{}
            })

    for table in table_blocks:
        # Record every related block id together with its relationship type.
        for relation in table["relationship"]:
            for block_id in relation["Ids"]:
                table["map"][block_id] = {"Type":relation["Type"]}

        # Bucket the related blocks under their block type on the table dict.
        for block_id in table["map"]:
            component = blocks_dict[block_id]
            table.setdefault(component["BlockType"], []).append(component)

    return table_blocks, blocks_dict

# Convert the table blocks from AWS Textract into a table (list of rows)
@terminal_print
def textract_output_to_table(table,blocks_dict):
    '''
    Convert the CELL blocks of one table into a list of rows of cell text

    Parameters:
    table: a table dict whose "CELL" entry is the list of the table's CELL blocks
        (a missing "CELL" entry is treated as a table without cells)
    blocks_dict: the dictionary of blocks with the block id as the key

    Returns:
    array: a list of rows; each row is a list with one string per cell, made of
        the cell's WORD texts joined by spaces ("" for a cell without words).
        A table without cells yields [[]].
    '''
    array = [[]]
    cur_row = 1

    # Sort defensively so the row grouping below does not depend on the order
    # in which the cells were listed; already-ordered input is left unchanged.
    cells = sorted(
        table.get("CELL", []),
        key=lambda cell: (cell["RowIndex"], cell.get("ColumnIndex", 0)),
    )

    for cell in cells:
        row_index = cell["RowIndex"]

        # A higher row index than the current one starts a new output row.
        if row_index > cur_row:
            array.append([])
            cur_row = row_index

        relationships = cell.get("Relationships")
        if relationships:
            # Only WORD blocks contribute text; other related block types are skipped.
            words = [
                blocks_dict[i]["Text"]
                for i in relationships[0]["Ids"]
                if blocks_dict[i]["BlockType"] == "WORD"
            ]
        else:
            # Missing or empty relationships: the cell has no content.
            words = [""]

        array[-1].append(" ".join(words))

    return array

@terminal_print
def get_tables(filename:str,bucket:str=default_s3_bucket):
    '''
    Run the whole table pipeline for one file: analyse it with Textract, gather
    all result pages, rebuild every table and render each one with create_md_table

    Parameters:
    filename: the name of the file in the bucket
    bucket: the name of the bucket (defaults to default_s3_bucket)

    Returns:
    md_tables: the list of create_md_table results, one per detected table
    '''
    first_page = get_article_tables(file_name=filename,bucket=bucket)
    all_pages = textract_get_tables(first_page)
    table_blocks, blocks_by_id = construct_tables(all_pages)

    # One rendered table per TABLE block, in the order construct_tables returned them.
    return [
        create_md_table(textract_output_to_table(table_block, blocks_by_id))
        for table_block in table_blocks
    ]