# add module
import json
import os
import shutil
import sys
from subprocess import call

from grobid_client.grobid_client import GrobidClient
module_path = os.path.abspath(os.path.join('/project'))
if module_path not in sys.path:
sys.path.append(module_path)
from core.tei import single_entry
temp_dir = '/project/temp'
pdffigures2_home = '/opt/pdffigures2'
grobid_home = '/opt/grobid'
grobid_python_config_pth = '/opt/grobid_client_python/config.json'
def remove_temp_directory():
if os.path.exists(temp_dir):
shutil.rmtree(temp_dir)
def grobid_clident():
return GrobidClient(config_path=grobid_python_config_pth)
def process_pdf(pdf_pth: str, file_name: str):
"""This function will preprocess pdf, generate xml, extract figures, and then move all things to /project/temp"""
client = grobid_clident()
remove_temp_directory()
name = file_name[:-4]
if not os.path.exists(temp_dir):
os.makedirs(temp_dir)
temp_pdf_dir = os.path.join(temp_dir, name, 'pdf')
if not os.path.exists(temp_pdf_dir):
os.makedirs(temp_pdf_dir)
temp_xml_dir = os.path.join(temp_dir, name, 'xml')
if not os.path.exists(temp_xml_dir):
os.makedirs(temp_xml_dir)
# copy pdf to temp dir
shutil.copy(pdf_pth, temp_pdf_dir)
# process to xml
client.process(
'processFulltextDocument',
temp_pdf_dir,
tei_coordinates=True,
force=True,
verbose=True,
output=temp_xml_dir,
)
xml_name = name + '.tei.xml'
xml_pth = os.path.join(temp_xml_dir, xml_name)
# now scan figures
fig_dir_profix = 'figure'
img_dir_profix = 'figure/image'
json_dir_profix = 'figure/json'
tmp_fig_dir = os.path.join(pdffigures2_home, fig_dir_profix)
if not os.path.exists(tmp_fig_dir):
os.makedirs(tmp_fig_dir)
tmp_img_dir = os.path.join(pdffigures2_home, img_dir_profix)
if not os.path.exists(tmp_img_dir):
os.makedirs(tmp_img_dir)
tmp_json_dir = os.path.join(pdffigures2_home, json_dir_profix)
if not os.path.exists(tmp_json_dir):
os.makedirs(tmp_json_dir)
args = [
'sbt',
'-J-Xmx4G',
'runMain org.allenai.pdffigures2.FigureExtractorBatchCli -e -q ' + os.path.abspath(temp_pdf_dir) + '/' + ' -m ' + './' + img_dir_profix + '/' + ' -d ' + './' + json_dir_profix + '/' + ' -s ' + './' + fig_dir_profix + '/stat.json',
]
call(args, cwd=pdffigures2_home)
shutil.move(tmp_fig_dir, os.path.join(temp_dir, name))
figure_json_pth = os.path.join(temp_dir, name, 'figure/json', name + '.json')
# merge to single json
_, title, abstract, text, headers, figures = single_entry('', xml_pth=xml_pth, fig_json_pth=figure_json_pth)
temp_json_dir = os.path.join(temp_dir, name, 'json')
if not os.path.exists(temp_json_dir):
os.makedirs(temp_json_dir)
json_data = {
'title': title,
'abstract': abstract,
'text': text,
'headers': headers,
'figures': figures,
}
import json
json_pth = os.path.join(temp_json_dir, name + '.json')
with open(json_pth, 'w') as f:
json.dump(json_data, f, indent=4)
# get preprocessed data
with open(json_pth, 'r') as f:
data = json.load(f)
paper_length = len(data['text'])
sections = [{
'idx': i,
'title': head['section'],
'n': head['n'],
'text': ' '.join([data['text'][idx]['string'] for idx in range(head['start'], min(head['end'] + 1, paper_length))]),
'matched_slides': [],
} for i, head in enumerate(data['headers'])]
with open(os.path.join(temp_dir, name, name + '.preprocessed_text.json'), 'w') as f:
json.dump([sec['text'] for sec in sections], f, indent=4)
if __name__ == '__main__':
process_pdf('/project/example/example.pdf')