Spaces:
Build error
Build error
import os | |
import json | |
import argparse | |
import time | |
from typing import Optional, Dict | |
from doc2json.tex2json.tex_to_xml import convert_latex_to_s2orc_json | |
from doc2json.tex2json.xml_to_json import convert_latex_xml_to_s2orc_json | |
# Default working directories, used as parameter defaults by the processing
# functions in this module. All are relative to the current working directory.
BASE_TEMP_DIR = 'temp'
BASE_OUTPUT_DIR = 'output'
BASE_LOG_DIR = 'log'
def process_tex_stream(
        fname: str,
        stream: bytes,
        temp_dir: str = BASE_TEMP_DIR,
        keep_flag: bool = False,
        grobid_config: Optional[Dict] = None
):
    """
    Save an in-memory input file to disk, process it and return the resulting JSON.

    The bytes are written to ``<temp_dir>/input/<fname>`` and that path is handed
    to ``process_tex_file``. The JSON file it reports is then loaded and returned.

    :param fname: file name under which the stream is stored in ``<temp_dir>/input``
    :param stream: raw bytes of the input file
    :param temp_dir: working directory for the stored input and partial files
    :param keep_flag: forwarded unchanged to ``process_tex_file``
    :param grobid_config: forwarded unchanged to ``process_tex_file``
    :return: the decoded JSON content of the output file, or an empty list
        when no output file was produced
    """
    # NOTE(review): fname is joined as-is; if it can come from an untrusted
    # source it should be sanitized by the caller -- confirm.
    temp_input_dir = os.path.join(temp_dir, 'input')
    temp_input_file = os.path.join(temp_input_dir, fname)

    # makedirs creates the intermediate temp_dir as well
    os.makedirs(temp_input_dir, exist_ok=True)

    with open(temp_input_file, 'wb') as outf:
        outf.write(stream)

    output_file = process_tex_file(
        temp_input_file, temp_dir=temp_dir, keep_flag=keep_flag, grobid_config=grobid_config
    )

    # process_tex_file returns None when the conversion produced no XML file;
    # os.path.exists(None) would raise TypeError, so check for that first.
    if not output_file or not os.path.exists(output_file):
        return []

    with open(output_file, 'r') as f:
        return json.load(f)
def process_tex_file(
        input_file: str,
        temp_dir: str = BASE_TEMP_DIR,
        output_dir: str = BASE_OUTPUT_DIR,
        log_dir: str = BASE_LOG_DIR,
        keep_flag: bool = False,
        grobid_config: Optional[Dict] = None
) -> Optional[str]:
    """
    Convert a LaTeX input file to JSON and write it to ``<output_dir>/<paper_id>.json``.

    The paper id is the base name of ``input_file`` without its last extension.
    If the output file already exists a message is printed and the file is
    overwritten.

    :param input_file: path to the input file to process
    :param temp_dir: directory handed to the LaTeX converter for partial files
    :param output_dir: directory the JSON file is written to
    :param log_dir: directory handed to the XML-to-JSON converter
    :param keep_flag: when False, the LaTeX converter is called with its
        cleanup flag set; when True, with the flag cleared
    :param grobid_config: forwarded to the XML-to-JSON converter
    :return: path of the written JSON file, or None when the LaTeX converter
        returned no XML file
    :raises FileNotFoundError: if ``input_file`` does not exist
    """
    # create directories
    for directory in (temp_dir, output_dir, log_dir):
        os.makedirs(directory, exist_ok=True)

    # get paper id as the name of the file; basename handles the platform's
    # path separators instead of assuming '/'
    paper_id = os.path.splitext(os.path.basename(input_file))[0]
    output_file = os.path.join(output_dir, f'{paper_id}.json')
    cleanup_flag = not keep_flag

    # the input file must exist; an existing output file is only reported
    # and gets overwritten further down
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"{input_file} doesn't exist")
    if os.path.exists(output_file):
        print(f'{output_file} already exists!')

    # process LaTeX
    xml_file = convert_latex_to_s2orc_json(input_file, temp_dir, cleanup_flag)
    if not xml_file:
        return None

    # convert to S2ORC
    paper = convert_latex_xml_to_s2orc_json(xml_file, log_dir, grobid_config=grobid_config)

    # write to file
    with open(output_file, 'w') as outf:
        json.dump(paper.release_json("latex"), outf, indent=4, sort_keys=False)

    return output_file
def _parse_keep_value(value: str) -> bool:
    """
    Interpret an explicit value given to ``-k/--keep``.

    Only the usual "false" spellings (and the empty string) turn the flag off;
    every other value keeps it on, which is how any non-empty value behaved
    before this option had a parser.

    :param value: the raw command line string following ``-k/--keep``
    :return: False for a false-like value, True otherwise
    """
    return value.strip().lower() not in {'', '0', 'false', 'f', 'no', 'n', 'off'}


def parse_args(argv: Optional[list] = None) -> argparse.Namespace:
    """
    Parse the command line options of the TEX2JSON script.

    :param argv: argument list to parse; None makes argparse read ``sys.argv``
    :return: namespace with ``input``, ``temp``, ``output``, ``log`` and ``keep``
    """
    parser = argparse.ArgumentParser(description="Run S2ORC TEX2JSON")
    # the input path is mandatory: without it processing cannot start
    parser.add_argument("-i", "--input", required=True, help="path to the input TEX zip file")
    parser.add_argument("-t", "--temp", default='temp', help="path to a temp dir for partial files")
    parser.add_argument("-o", "--output", default='output', help="path to the output dir for putting json files")
    parser.add_argument("-l", "--log", default='log', help="path to the log dir")
    # bare "-k" switches the flag on; "-k <value>" is still accepted and is
    # parsed as a boolean so that "-k False" really means False
    parser.add_argument(
        "-k", "--keep", nargs='?', const=True, default=False, type=_parse_keep_value,
        help="keep temporary files"
    )
    return parser.parse_args(argv)


def main(argv: Optional[list] = None) -> None:
    """
    Command line entry point: process one input file and print the runtime.

    :param argv: argument list to parse; None makes argparse read ``sys.argv``
    """
    args = parse_args(argv)

    start_time = time.time()

    # process_tex_file creates the temp, output and log directories itself
    process_tex_file(args.input, args.temp, args.output, args.log, args.keep)

    runtime = round(time.time() - start_time, 3)
    print("runtime: %s seconds " % (runtime))
    print('done.')


if __name__ == '__main__':
    main()