Spaces:
Sleeping
Sleeping
# MIT License
#
# Copyright (c) 2024 dataforgood
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Standard imports | |
import logging | |
# Local imports | |
from . import pagefilter, table_extraction | |
from .utils.utils import keep_pages | |
class ReportProcessor:
    """Run the page-filter and table-extraction stages on a PDF report.

    The stages are built once from a configuration dictionary; the resulting
    processor can then be applied to PDF files through :meth:`process`.
    """

    def __init__(self, config: dict) -> None:
        """Build the processing stages described by ``config``.

        Args:
            config: Pipeline configuration. The ``"pagefilter"`` entry is
                required and is handed to ``pagefilter.from_config``. The
                optional ``"table_extraction"`` entry is iterated and each
                item is handed to ``table_extraction.from_config``.

        Raises:
            KeyError: If ``config`` has no ``"pagefilter"`` entry.
        """
        # Report filter
        self.page_filter = pagefilter.from_config(config["pagefilter"])

        self.table_extractors = []
        self.table_cleaners = []

        # Tables extraction
        if "table_extraction" in config:
            table_extractors = config["table_extraction"]
            self.table_extractors = [
                table_extraction.from_config(name) for name in table_extractors
            ]

        # Table cleaning & reformatting
        # We can do this step only if we had table extraction algorithms,
        # otherwise the assets will not be available.
        # TODO: the cleaning stage is currently disabled; kept for reference.
        # if "table_cleaning" in config:
        #     table_cleaners = config["table_cleaning"]
        #     self.table_cleaners = [
        #         table_cleaning.from_config(name) for name in table_cleaners
        #     ]

    def process(self, pdf_filepath: str) -> dict:
        """Filter the pages of a PDF and run every table extractor on them.

        Args:
            pdf_filepath: Path of the PDF report to process.

        Returns:
            The assets dictionary with three keys: ``"pagefilter"`` (filled
            in by the page filter), ``"table_extractors"`` (one entry per
            configured extractor, in configuration order) and
            ``"table_cleaners"`` (left empty while the cleaning stage is
            disabled).

        Raises:
            KeyError: If the page filter did not store ``"selected_pages"``
                under ``assets["pagefilter"]``.
        """
        # Lazy %-style arguments: the message is only formatted if emitted.
        logging.info("Processing %s", pdf_filepath)

        assets = {
            "pagefilter": {},
            "table_extractors": [],
            "table_cleaners": [],
        }

        # Identifying the pages to extract. The filter receives ``assets``
        # and is expected to record its selection in it.
        self.page_filter(pdf_filepath, assets)

        # Now that we identified the pages to be extracted, we extract them.
        # Note: in a GUI, we could ask the user to change the content of
        # assets["pagefilter"]["selected_pages"] before selecting the pages.
        pdf_to_process = keep_pages(
            pdf_filepath,
            assets["pagefilter"]["selected_pages"],
        )

        # Process the selected pages to detect the tables and extract
        # their contents.
        for table_extractor in self.table_extractors:
            new_asset = table_extractor(pdf_to_process)
            assets["table_extractors"].append(new_asset)

        # Give the parsed content to the cleaner stage for getting organized
        # data.
        # TODO: the cleaning stage is currently disabled; kept for reference.
        # for table_cleaner in self.table_cleaners:
        #     for asset in assets["table_extractors"]:
        #         new_asset = table_cleaner(asset)
        #         assets["table_cleaners"].append(new_asset)

        return assets