# nianlonggu
# init
# 02ae0bf
import os
import json
import argparse
import time
from typing import Optional, Dict
from doc2json.tex2json.tex_to_xml import convert_latex_to_s2orc_json
from doc2json.tex2json.xml_to_json import convert_latex_xml_to_s2orc_json
# Default working directories, used as the default values of the temp_dir,
# output_dir and log_dir parameters of process_tex_stream / process_tex_file.
# They are relative paths, so they resolve against the current working
# directory of the process.
BASE_TEMP_DIR = 'temp'
BASE_OUTPUT_DIR = 'output'
BASE_LOG_DIR = 'log'
def process_tex_stream(
        fname: str,
        stream: bytes,
        temp_dir: str = BASE_TEMP_DIR,
        keep_flag: bool = False,
        grobid_config: Optional[Dict] = None
):
    """
    Process a gz file stream

    The bytes are written to ``<temp_dir>/input/<fname>``, that file is handed
    to ``process_tex_file`` and the JSON file it produces is loaded and
    returned.

    :param fname: file name under which the stream is stored in the temp input dir
    :param stream: raw bytes of the file to process
    :param temp_dir: working directory; the input copy lives in its ``input`` subdir
    :param keep_flag: forwarded to ``process_tex_file``
    :param grobid_config: forwarded to ``process_tex_file``
    :return: the parsed JSON content of the output file, or an empty list when
        no output file was produced
    """
    temp_input_dir = os.path.join(temp_dir, 'input')
    temp_input_file = os.path.join(temp_input_dir, fname)

    # makedirs creates temp_dir as an intermediate directory as well
    os.makedirs(temp_input_dir, exist_ok=True)

    with open(temp_input_file, 'wb') as outf:
        outf.write(stream)

    output_file = process_tex_file(
        temp_input_file, temp_dir=temp_dir, keep_flag=keep_flag, grobid_config=grobid_config
    )

    # process_tex_file returns None when the LaTeX conversion yields nothing;
    # os.path.exists(None) would raise TypeError, so check for None first.
    if output_file is None or not os.path.exists(output_file):
        return []

    with open(output_file, 'r') as f:
        return json.load(f)
def process_tex_file(
        input_file: str,
        temp_dir: str = BASE_TEMP_DIR,
        output_dir: str = BASE_OUTPUT_DIR,
        log_dir: str = BASE_LOG_DIR,
        keep_flag: bool = False,
        grobid_config: Optional[Dict] = None
) -> Optional[str]:
    """
    Process files in a TEX zip and get JSON representation

    :param input_file: path to the input file; its base name without the last
        extension is used as the paper id
    :param temp_dir: directory for intermediate files (created if missing)
    :param output_dir: directory the ``<paper_id>.json`` file is written to
        (created if missing)
    :param log_dir: directory passed to the XML-to-JSON converter (created if
        missing)
    :param keep_flag: when False, the LaTeX converter is called with its
        cleanup flag set
    :param grobid_config: passed through to the XML-to-JSON converter
    :return: path of the JSON file that was written, or None when the LaTeX
        conversion step returned nothing
    :raises FileNotFoundError: if ``input_file`` does not exist
    """
    # create directories
    for directory in (temp_dir, output_dir, log_dir):
        os.makedirs(directory, exist_ok=True)

    # get paper id as the name of the file; os.path.basename honours the
    # platform's path separators instead of assuming '/'
    paper_id = os.path.splitext(os.path.basename(input_file))[0]
    output_file = os.path.join(output_dir, f'{paper_id}.json')
    cleanup_flag = not keep_flag

    # check if input file exists and output file doesn't
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"{input_file} doesn't exist")
    if os.path.exists(output_file):
        # notice only: processing continues and the existing file is overwritten
        print(f'{output_file} already exists!')

    # process LaTeX
    xml_file = convert_latex_to_s2orc_json(input_file, temp_dir, cleanup_flag)
    if not xml_file:
        return None

    # convert to S2ORC
    paper = convert_latex_xml_to_s2orc_json(xml_file, log_dir, grobid_config=grobid_config)

    # write to file
    with open(output_file, 'w') as outf:
        json.dump(paper.release_json("latex"), outf, indent=4, sort_keys=False)

    return output_file
def _str_to_bool(value: str) -> bool:
    """Parse a command-line boolean such as ``true``/``false``/``1``/``0``.

    :raises argparse.ArgumentTypeError: if the text is not a recognised boolean
    """
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


if __name__ == '__main__':
    # Command-line entry point: convert one input file and report the runtime.
    parser = argparse.ArgumentParser(description="Run S2ORC TEX2JSON")
    # Without an input path there is nothing to do, so let argparse report it
    # instead of failing later with a TypeError on None.
    parser.add_argument("-i", "--input", required=True, help="path to the input TEX zip file")
    parser.add_argument("-t", "--temp", default='temp', help="path to a temp dir for partial files")
    parser.add_argument("-o", "--output", default='output', help="path to the output dir for putting json files")
    parser.add_argument("-l", "--log", default='log', help="path to the log dir")
    # Accept a bare "-k" as well as "-k true" / "-k false". Previously any
    # non-empty value, including the string "False", enabled keeping temp files.
    parser.add_argument(
        "-k", "--keep", nargs='?', const=True, default=False, type=_str_to_bool,
        help="keep temporary files"
    )

    args = parser.parse_args()

    input_path = args.input
    temp_path = args.temp
    output_path = args.output
    log_path = args.log
    keep_temp = args.keep

    start_time = time.time()

    os.makedirs(temp_path, exist_ok=True)
    os.makedirs(output_path, exist_ok=True)

    process_tex_file(input_path, temp_path, output_path, log_path, keep_temp)

    runtime = round(time.time() - start_time, 3)
    print("runtime: %s seconds " % (runtime))
    print('done.')