|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import logging |
|
|
|
|
|
import os |
|
import uuid |
|
from io import StringIO |
|
from pathlib import Path |
|
|
|
import pandas as pd |
|
from unstructured_client import UnstructuredClient |
|
from unstructured_client.models import shared |
|
|
|
|
|
class UnstructuredAPI:
    """Table extractor backed by the unstructured.io partition API.

    An instance is a callable: given the path of a PDF file it sends the
    file to the API, keeps the elements typed ``"Table"`` and converts
    their HTML rendering into pandas DataFrames.
    """

    def __init__(self, **kwargs: object) -> None:
        """
        Builds a pdf page parser, looking for tables using
        the unstructured.io api.

        The kwargs given to the constructor are forwarded as keyword
        arguments to ``shared.PartitionParameters`` when the instance is
        called. They take precedence over the defaults set by this class
        (``strategy`` and ``pdf_infer_table_structure``).
        """
        self.kwargs = kwargs
        self.type = "unstructured_api"

    @staticmethod
    def _tables_from_elements(elements) -> list:
        """Return one DataFrame per readable ``"Table"`` element.

        Args:
            elements: Iterable of element mappings, each with a ``"type"``
                key; table elements are expected to carry their HTML under
                ``["metadata"]["text_as_html"]``.

        Returns:
            List of DataFrames, in the order the tables were encountered.
            Tables that cannot be parsed are logged and skipped.
        """
        tables = []
        for el in elements:
            if el["type"] != "Table":
                continue
            # Best effort: a table whose HTML is missing or that pandas
            # cannot parse is dropped rather than aborting the extraction.
            try:
                html = el["metadata"]["text_as_html"]
                table = pd.read_html(StringIO(html))[0]
            except Exception:
                logging.info(
                    "Html table discarded. Pandas couldn't read the table.",
                )
            else:
                tables.append(table)
        return tables

    def __call__(self, pdf_filepath: str) -> dict:
        """Extract the tables of a PDF file through the unstructured.io API.

        Args:
            pdf_filepath: Path of the PDF file to send to the API.

        Returns:
            A dict with the keys ``"id"`` (a fresh ``uuid.uuid4()``),
            ``"type"``, ``"params"`` (the constructor kwargs) and
            ``"tables"`` (list of DataFrames). ``"tables"`` is empty when
            the API request fails or no table could be read.

        Raises:
            OSError: If ``pdf_filepath`` cannot be opened for reading.
        """
        logging.info("\nKicking off extraction stage...")
        logging.info(
            "Extraction type: %s, with params: %s", self.type, self.kwargs
        )

        client = UnstructuredClient(api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"))

        with Path(pdf_filepath).open("rb") as f:
            files = shared.Files(
                content=f.read(),
                file_name=pdf_filepath,
            )

        # Defaults come first so that user kwargs can override them instead
        # of triggering a "multiple values for keyword argument" TypeError.
        # NOTE(review): pdf_infer_table_structure is sent as the string
        # "True", not a bool - confirm against the client version in use.
        params = {
            "strategy": "hi_res",
            "pdf_infer_table_structure": "True",
            **self.kwargs,
        }
        req = shared.PartitionParameters(files=files, **params)

        # Initialised before the request so that a failed call still yields
        # a well-formed asset (previously this path hit UnboundLocalError).
        tables_list = []
        try:
            resp = client.general.partition(req)
        except Exception:
            # Top-level boundary around a remote call: record the traceback
            # and fall through with no tables.
            logging.exception(
                "Unstructured API partition request failed for %s",
                pdf_filepath,
            )
        else:
            # Guard against a response carrying no element list at all.
            tables_list = self._tables_from_elements(resp.elements or [])

        return {
            "id": uuid.uuid4(),
            "type": "unstructured_api",
            "params": self.kwargs,
            "tables": tables_list,
        }
|
|