File size: 3,834 Bytes
9ee83a7
 
 
 
 
 
 
 
 
 
 
 
 
 
2d902f4
 
 
 
9ee83a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# Imports; the project root is appended to sys.path below so that `core` can be imported.
import os
import shutil
import sys
from subprocess import call

from grobid_client.grobid_client import GrobidClient

# Put the project root on the import path (once) so the `core` package
# imported right after this resolves.
module_path = os.path.abspath('/project')
if module_path not in sys.path:
    sys.path.append(module_path)

from core.tei import single_entry

# Locations of the external tools used by this script.
# GROBID install directory (not referenced elsewhere in this file).
grobid_home = '/opt/grobid'
# Config file handed to GrobidClient in grobid_clident().
grobid_python_config_pth = '/opt/grobid_client_python/config.json'
# pdffigures2 checkout; sbt is run with this as its working directory.
pdffigures2_home = '/opt/pdffigures2'

# Scratch directory: wiped by remove_temp_directory() and refilled by process_pdf().
temp_dir = '/project/temp'


def remove_temp_directory():
    """Delete the whole ``temp_dir`` tree; do nothing if it does not exist."""
    if not os.path.exists(temp_dir):
        return
    shutil.rmtree(temp_dir)


def grobid_clident():
    """Create a ``GrobidClient`` configured from ``grobid_python_config_pth``."""
    client = GrobidClient(config_path=grobid_python_config_pth)
    return client


def _section_texts(text, headers):
    """Join the text entries covered by each header into one string per section."""
    paper_length = len(text)
    # A header's 'end' index is inclusive; the upper bound is clamped so that a
    # header can never index past the last text entry.
    return [
        ' '.join(
            text[idx]['string']
            for idx in range(head['start'], min(head['end'] + 1, paper_length))
        )
        for head in headers
    ]


def process_pdf(pdf_pth: str, file_name: str = None):
    """Preprocess one PDF into TEI XML, figures and merged JSON under ``temp_dir``.

    Steps, all rooted at ``<temp_dir>/<name>/`` where ``name`` is ``file_name``
    without its extension:

    1. wipe ``temp_dir`` and copy the PDF to ``pdf/<name>.pdf``;
    2. run GROBID ``processFulltextDocument`` on ``pdf/`` with output in ``xml/``;
    3. run the pdffigures2 batch CLI through ``sbt`` and move the resulting
       ``figure`` directory here;
    4. merge XML and figure JSON via ``single_entry`` into ``json/<name>.json``;
    5. write the per-section texts to ``<name>.preprocessed_text.json``.

    Args:
        pdf_pth: Path of the PDF file to process.
        file_name: Name of the PDF (with extension) that determines ``name``.
            When omitted or empty, the basename of ``pdf_pth`` is used.

    Raises:
        RuntimeError: If the ``sbt`` figure-extraction command exits non-zero.
    """
    import json

    if not file_name:
        file_name = os.path.basename(pdf_pth)
    # splitext instead of a fixed [:-4] slice, so names without a 4-character
    # extension are not truncated.
    name = os.path.splitext(file_name)[0]

    client = grobid_clident()
    remove_temp_directory()

    paper_dir = os.path.join(temp_dir, name)
    temp_pdf_dir = os.path.join(paper_dir, 'pdf')
    temp_xml_dir = os.path.join(paper_dir, 'xml')
    os.makedirs(temp_pdf_dir, exist_ok=True)
    os.makedirs(temp_xml_dir, exist_ok=True)

    # Copy the PDF under the same stem that the XML / figure JSON are looked up
    # with below, so the lookups still match when `file_name` differs from the
    # basename of `pdf_pth`.
    shutil.copy(pdf_pth, os.path.join(temp_pdf_dir, name + '.pdf'))

    # process to xml
    client.process(
        'processFulltextDocument',
        temp_pdf_dir,
        tei_coordinates=True,
        force=True,
        verbose=True,
        output=temp_xml_dir,
    )

    # NOTE(review): assumes the GROBID client names its output `<stem>.tei.xml`
    # -- confirm against the installed grobid_client version.
    xml_pth = os.path.join(temp_xml_dir, name + '.tei.xml')

    # now scan figures
    fig_dir_profix = 'figure'
    img_dir_profix = 'figure/image'
    json_dir_profix = 'figure/json'

    tmp_fig_dir = os.path.join(pdffigures2_home, fig_dir_profix)
    # The figure directory is moved away after every successful run, so anything
    # still here is residue of a failed run and must not leak into this result.
    if os.path.exists(tmp_fig_dir):
        shutil.rmtree(tmp_fig_dir)
    os.makedirs(os.path.join(pdffigures2_home, img_dir_profix), exist_ok=True)
    os.makedirs(os.path.join(pdffigures2_home, json_dir_profix), exist_ok=True)

    # NOTE(review): the whole runMain command is passed to sbt as one string, so
    # a space in the PDF directory path would presumably split the argument --
    # confirm before accepting file names with spaces.
    sbt_command = (
        'runMain org.allenai.pdffigures2.FigureExtractorBatchCli -e -q '
        + os.path.abspath(temp_pdf_dir) + '/'
        + ' -m ./' + img_dir_profix + '/'
        + ' -d ./' + json_dir_profix + '/'
        + ' -s ./' + fig_dir_profix + '/stat.json'
    )
    exit_code = call(['sbt', '-J-Xmx4G', sbt_command], cwd=pdffigures2_home)
    if exit_code != 0:
        raise RuntimeError(
            'pdffigures2 figure extraction failed with exit code {}'.format(exit_code)
        )

    # paper_dir already exists, so this lands at <paper_dir>/figure.
    shutil.move(tmp_fig_dir, paper_dir)

    figure_json_pth = os.path.join(paper_dir, json_dir_profix, name + '.json')

    # merge to single json
    _, title, abstract, text, headers, figures = single_entry('', xml_pth=xml_pth, fig_json_pth=figure_json_pth)

    temp_json_dir = os.path.join(paper_dir, 'json')
    os.makedirs(temp_json_dir, exist_ok=True)

    json_data = {
        'title': title,
        'abstract': abstract,
        'text': text,
        'headers': headers,
        'figures': figures,
    }

    json_pth = os.path.join(temp_json_dir, name + '.json')
    with open(json_pth, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, indent=4)

    # Build the section texts from the in-memory data rather than re-reading
    # the file that was just written.
    section_texts = _section_texts(text, headers)

    preprocessed_pth = os.path.join(paper_dir, name + '.preprocessed_text.json')
    with open(preprocessed_pth, 'w', encoding='utf-8') as f:
        json.dump(section_texts, f, indent=4)


if __name__ == '__main__':
    # process_pdf takes the PDF path and the PDF's file name; the original call
    # passed only the path and failed with a TypeError.
    process_pdf('/project/example/example.pdf', 'example.pdf')