Spaces:
Configuration error
Configuration error
File size: 3,788 Bytes
27a8994 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
from unstructured.partition.pdf import partition_pdf
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from dotenv import load_dotenv
# Runs at import time, before any function in this module is called.
# NOTE(review): presumably this loads the OpenAI credentials that ChatOpenAI
# needs from a local .env file — confirm against the deployment setup.
load_dotenv()
def get_images_base64(chunks):
    """Collect the base64 payloads of images embedded in composite chunks.

    Only chunks whose type name contains ``CompositeElement`` are inspected.
    For each of them, every entry of ``chunk.metadata.orig_elements`` whose
    type name contains ``Image`` contributes its ``metadata.image_base64``.

    Args:
        chunks: Iterable of chunk objects exposing ``metadata.orig_elements``.

    Returns:
        list: The image payloads, in the order they were encountered.
        Composite chunks without original elements and images without a
        payload are skipped instead of raising or yielding ``None`` entries.
    """
    images_b64 = []
    for chunk in chunks:
        # Types are matched by name, so this module needs no element imports.
        if "CompositeElement" not in str(type(chunk)):
            continue
        # orig_elements can be None (nothing was retained for the chunk);
        # treat that as "no images" rather than failing on iteration.
        for el in chunk.metadata.orig_elements or []:
            if "Image" not in str(type(el)):
                continue
            payload = el.metadata.image_base64
            # An image without an extracted payload has nothing to return.
            if payload:
                images_b64.append(payload)
    return images_b64
def LoadAndExtractData(file_path):
    """Partition a PDF and split the result into tables, texts and images.

    Args:
        file_path: Path of the PDF file handed to ``partition_pdf``.

    Returns:
        tuple: ``(tables, texts, images)`` where ``tables`` holds the chunks
        whose type name contains ``Table``, ``texts`` the chunks whose type
        name contains ``CompositeElement``, and ``images`` whatever
        ``get_images_base64`` returns for the chunks.
        If anything raises, the error is printed and ``([], [], message)`` is
        returned, i.e. the third slot carries the error text instead of images.
    """
    try:
        print(">> Extracting Data")
        # NOTE: no explicit strategy is passed, so partition_pdf uses its own
        # default (the original author had strategy="hi_res" disabled here).
        partition_options = dict(
            filename=file_path,
            infer_table_structure=True,  # ask for table structure
            # Block types to extract as images; "Table" could be added to
            # the list to capture table images as well.
            extract_image_block_types=["Image"],
            extract_image_block_to_payload=True,  # keep base64 in metadata
            chunking_strategy="by_title",  # "basic" is the alternative
            max_characters=10000,
            combine_text_under_n_chars=2000,
            new_after_n_chars=6000,
        )
        elements = partition_pdf(**partition_options)

        print(">> Extracting Text and tables...")
        tables = []
        texts = []
        for element in elements:
            # Elements are classified by type name; the two checks are
            # independent, exactly as two separate `if`s.
            type_name = str(type(element))
            if "Table" in type_name:
                tables.append(element)
            if "CompositeElement" in type_name:
                texts.append(element)
        print(">> Chunks are: ", elements)

        print(">> Extracting Images...")
        images = get_images_base64(elements)
        return tables, texts, images
    except Exception as e:
        # Best-effort contract: report the failure and hand the message back
        # in the images slot rather than propagating the exception.
        message = str(e)
        print("Error is: ", message)
        return [], [], message
# Summarizer Function
def Summarizer(prompt_template, data, config=True, set_messages=False):
    """Summarize a batch of inputs with a ``gpt-4o-mini`` chat chain.

    Args:
        prompt_template (str): Prompt text. In the default mode it is the
            whole template and each input is supplied as the ``element``
            variable. With ``set_messages`` it is the text part of a user
            message that also carries an image.
        data (List[Dict] or List[str]): Inputs passed to the chain's
            ``batch`` call, one summary per input.
        config (bool): When truthy, the batch runs with a
            ``max_concurrency`` of 3; otherwise no batch config is passed.
        set_messages (bool): When truthy, build a chat message pairing the
            prompt text with a ``data:image/jpeg;base64,...`` URL filled from
            the ``image`` variable.

    Returns:
        List[str]: The summaries produced by the chain. If any step raises,
        the exception message is returned as a plain string instead.
    """
    try:
        if set_messages:
            # Multimodal prompt: one user turn holding text plus inline image.
            input_key = "image"
            user_content = [
                {"type": "text", "text": prompt_template},
                {
                    "type": "image_url",
                    "image_url": {"url": "data:image/jpeg;base64,{image}"},
                },
            ]
            prompt = ChatPromptTemplate.from_messages([("user", user_content)])
        else:
            # Text-only prompt: the template itself references {element}.
            input_key = "element"
            prompt = ChatPromptTemplate.from_template(prompt_template)

        model = ChatOpenAI(temperature=0.5, model="gpt-4o-mini")
        # Each raw input is passed through unchanged under the prompt variable.
        summarize_chain = {input_key: lambda x: x} | prompt | model | StrOutputParser()

        if not config:
            return summarize_chain.batch(data)
        return summarize_chain.batch(data, {"max_concurrency": 3})
    except Exception as e:
        # Failures are reported to the caller as text, not raised.
        return str(e)
|