File size: 3,788 Bytes
27a8994
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from unstructured.partition.pdf import partition_pdf
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from dotenv import load_dotenv
load_dotenv()


def get_images_base64(chunks):
    """Collect base64-encoded images embedded in composite chunks.

    Walks every chunk whose type name contains "CompositeElement",
    inspects its original sub-elements, and gathers the base64 payload
    of each sub-element whose type name contains "Image".

    Args:
        chunks: Iterable of unstructured partition elements.

    Returns:
        list[str]: Base64 image payloads, in encounter order.
    """
    collected = []
    composites = (c for c in chunks if "CompositeElement" in str(type(c)))
    for composite in composites:
        for element in composite.metadata.orig_elements:
            if "Image" in str(type(element)):
                collected.append(element.metadata.image_base64)
    return collected


def LoadAndExtractData(file_path):
    """Partition a PDF into table, text, and image chunks.

    Args:
        file_path (str): Path to the PDF file to partition.

    Returns:
        tuple: ``(tables, texts, images)`` where ``tables`` and ``texts``
        are lists of unstructured elements and ``images`` is a list of
        base64 strings. On failure returns ``([], [], error_message)`` —
        note the third slot is a *string* in the error case, so callers
        should check the type before treating it as a list of images.
    """
    try:
        tables = []
        texts = []

        print(">> Extracting Data")
        data = partition_pdf(
            filename=file_path,
            infer_table_structure=True,           # extract tables
            # strategy="hi_res",                  # required to infer tables
            extract_image_block_types=["Image"],  # add "Table" to also capture table images
            extract_image_block_to_payload=True,  # embed base64 payloads for API usage
            chunking_strategy="by_title",         # or "basic"
            max_characters=10000,                 # defaults to 500
            combine_text_under_n_chars=2000,      # defaults to 0
            new_after_n_chars=6000,
            # extract_images_in_pdf=True,         # deprecated
        )

        # Separate table elements from composite (text) chunks; the type
        # is detected by name since unstructured returns mixed element types.
        print(">> Extracting Text and tables...")
        for chunk in data:
            if "Table" in str(type(chunk)):
                tables.append(chunk)
            if "CompositeElement" in str(type(chunk)):
                texts.append(chunk)
        print(">> Chunks are: ", data)

        # Pull the base64-encoded images out of the composite chunks.
        print(">> Extracting Images...")
        images = get_images_base64(data)
        return tables, texts, images
    except Exception as e:
        # Best-effort error path: log and return empty results, preserving
        # the original contract of placing the error string in the third slot.
        print("Error is: ", str(e))
        return [], [], str(e)


# Summarizer Function
def Summarizer(prompt_template, data, config=True, set_messages=False):
    """Summarize documents using a prompt template and the ChatOpenAI model.

    Args:
        prompt_template (str): Template string for the prompt.
        data (list): Inputs to summarize; each item is fed to the chain.
            When ``set_messages`` is True, each item is expected to be a
            base64-encoded image string.
        config (bool): If True, batch with ``max_concurrency=3``.
        set_messages (bool): If True, build a multimodal (text + image_url)
            chat prompt instead of a plain text template.

    Returns:
        list[str]: One summary per input item, or the error message as a
        plain ``str`` if the chain raised (callers should check the type).
    """
    try:
        # Both branches use the same model; construct it once.
        model = ChatOpenAI(temperature=0.5, model="gpt-4o-mini")
        if set_messages:
            # Multimodal prompt: the base64 image is injected into the
            # data URI via the {image} placeholder at invocation time.
            messages = [
                (
                    "user",
                    [
                        {"type": "text", "text": prompt_template},
                        {
                            "type": "image_url",
                            "image_url": {"url": "data:image/jpeg;base64,{image}"},
                        },
                    ],
                )
            ]
            prompt = ChatPromptTemplate.from_messages(messages)
            summarize_chain = {"image": lambda x: x} | prompt | model | StrOutputParser()
        else:
            prompt = ChatPromptTemplate.from_template(prompt_template)
            summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

        if config:
            return summarize_chain.batch(data, {"max_concurrency": 3})
        return summarize_chain.batch(data)
    except Exception as e:
        # NOTE(review): returns the error string instead of raising or
        # returning a list — preserved for backward compatibility.
        return str(e)