Spaces:
Build error
Build error
File size: 1,816 Bytes
230c9a6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
import os
import sys
import os.path as osp
import argparse
from pdf2markdown import PDF2MARKDOWN
sys.path.append(osp.join(os.path.dirname(os.path.abspath(__file__)), '..', '..', '..'))
from pdf_extract_kit.utils.config_loader import load_config, initialize_tasks_and_models
from pdf_extract_kit.registry.registry import TASK_REGISTRY
# Key used in main() to look up the task class in TASK_REGISTRY.
TASK_NAME = 'pdf2markdown'
def parse_args(argv=None):
    """Parse the command-line options for this script.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. When ``None`` (the default), argparse
            falls back to ``sys.argv[1:]``, so existing no-argument calls
            behave exactly as before.

    Returns:
        argparse.Namespace whose ``config`` attribute holds the path given
        via the required ``--config`` option.

    Raises:
        SystemExit: raised by argparse when ``--config`` is missing or the
            arguments cannot be parsed.
    """
    parser = argparse.ArgumentParser(description="Run a task with a given configuration file.")
    parser.add_argument('--config', type=str, required=True, help='Path to the configuration file.')
    return parser.parse_args(argv)
def _model_or_none(task_instances, task_key):
    """Return the ``model`` of the task stored under *task_key*, or None if the key is absent."""
    if task_key not in task_instances:
        return None
    return task_instances[task_key].model


def main(config_path):
    """Run the task registered under TASK_NAME using a configuration file.

    Loads the configuration, initializes the configured tasks/models, builds
    the task object from whichever sub-models are present and calls its
    ``process`` method on the configured inputs.

    Configuration keys read here:
        inputs: what to process (required).
        outputs: directory for results; defaults to 'outputs/pdf_extract'.
        visualize: forwarded to ``process``; defaults to False.
        merge2markdown: forwarded to ``process``; defaults to False.

    Args:
        config_path: Path to the configuration file, passed to ``load_config``.

    Returns:
        Whatever the task's ``process`` call returns.

    Raises:
        ValueError: if the configuration does not provide ``inputs``.
    """
    config = load_config(config_path)
    task_instances = initialize_tasks_and_models(config)

    # get input and output path from config
    input_data = config.get('inputs', None)
    if input_data is None:
        # Fail with an explicit message instead of handing None to process().
        raise ValueError(
            f"Configuration {config_path!r} does not define 'inputs'; nothing to process."
        )
    result_path = config.get('outputs', 'outputs/pdf_extract')
    visualize = config.get('visualize', False)
    merge2markdown = config.get('merge2markdown', False)

    # Any sub-task missing from the configuration yields None for its model.
    layout_model = _model_or_none(task_instances, 'layout_detection')
    mfd_model = _model_or_none(task_instances, 'formula_detection')
    mfr_model = _model_or_none(task_instances, 'formula_recognition')
    ocr_model = _model_or_none(task_instances, 'ocr')

    # The task constructor takes the models positionally, in this order.
    task_cls = TASK_REGISTRY.get(TASK_NAME)
    pdf_extract_task = task_cls(layout_model, mfd_model, mfr_model, ocr_model)
    extract_results = pdf_extract_task.process(
        input_data,
        save_dir=result_path,
        visualize=visualize,
        merge2markdown=merge2markdown,
    )

    print(f'Task done, results can be found at {result_path}')
    return extract_results
# Script entry point: read --config from the command line and run the task.
if __name__ == "__main__":
    cli_args = parse_args()
    main(config_path=cli_args.config)
|