""" Many of the REGEX expressions and pipeline in this set of utilities are borrowed or extended from the unarXive project: https://github.com/IllDepence/unarXive Modifications have been made to better identify the primary latex file and expand all other latex files into the main file. Latexpand and tralics options have also been changed. """ import chardet import magic import os import re import glob import subprocess import tempfile MAIN_TEX_PATT = re.compile(r'(\\begin\s*\{\s*document\s*\})', re.I) # ^ with capturing parentheses so that the pattern can be used for splitting PDF_EXT_PATT = re.compile(r'^\.pdf$', re.I) GZ_EXT_PATT = re.compile(r'^\.gz$', re.I) TEX_EXT_PATT = re.compile(r'^\.tex$', re.I) NON_TEXT_PATT = re.compile(r'^\.(pdf|eps|jpg|png|gif)$', re.I) BBL_SIGN = '\\bibitem' # natbib fix PRE_FIX_NATBIB = True NATBIB_PATT = re.compile((r'\\cite(t|p|alt|alp|author|year|yearpar)\s*?\*?\s*?' '(\[[^\]]*?\]\s*?)*?\s*?\*?\s*?\{([^\}]+?)\}'), re.I) # bibitem option fix PRE_FIX_BIBOPT = True BIBOPT_PATT = re.compile(r'\\bibitem\s*?\[[^]]*?\]', re.I|re.M) # ↑ above two solve most tralics problems; except for mnras style bibitems # (https://ctan.org/pkg/mnras) # agressive math pre-removal PRE_FILTER_MATH = False FILTER_PATTS = [] for env in ['equation', 'displaymath', 'array', 'eqnarray', 'align', 'gather', 'multline', 'flalign', 'alignat']: s = r'\\begin\{{{0}[*]?\}}.+?\\end\{{{0}\}}'.format(env) patt = re.compile(s, re.I | re.M | re.S) FILTER_PATTS.append(patt) FILTER_PATTS.append(re.compile(r'\$\$.+?\$\$', re.S)) FILTER_PATTS.append(re.compile(r'\$.+?\$', re.S)) FILTER_PATTS.append(re.compile(r'\\\(.+?\\\)', re.S)) FILTER_PATTS.append(re.compile(r'\\\[.+?\\\]', re.S)) def read_file(path): try: with open(path) as f: cntnt = f.read() except UnicodeDecodeError: blob = open(path, 'rb').read() m = magic.Magic(mime_encoding=True) encoding = m.from_buffer(blob) try: cntnt = blob.decode(encoding) except (UnicodeDecodeError, LookupError) as e: encoding = chardet.detect(blob)['encoding'] if encoding: try: cntnt = blob.decode(encoding, errors='replace') except: return '' else: return '' return cntnt def remove_math(latex_str): parts = re.split(MAIN_TEX_PATT, latex_str, maxsplit=1) for patt in FILTER_PATTS: parts[2] = re.sub(patt, '', parts[2]) return ''.join(parts) def normalize(path, out_dir, write_logs=True): """ Normalize an arXiv file Adapted from https://github.com/IllDepence/unarXive with modifications Identifies the primary *.tex file, the bibliography file, and expands other tex files and the bibliography into the main tex file """ def log(msg): if write_logs: with open(os.path.join(out_dir, 'log.txt'), 'a') as f: f.write('{}\n'.format(msg)) # break path _, fn = os.path.split(path.strip('/')) # identify main tex file main_tex_path = None ignored_names = [] # check .tex files first for tfn in os.listdir(path): if not TEX_EXT_PATT.match(os.path.splitext(tfn)[1]): ignored_names.append(tfn) continue try: cntnt = read_file(os.path.join(path, tfn)) except: continue if re.search(MAIN_TEX_PATT, cntnt) is not None: main_tex_path = tfn # try other files if main_tex_path is None: for tfn in ignored_names: if NON_TEXT_PATT.match(os.path.splitext(tfn)[1]): continue try: cntnt = read_file(os.path.join(path, tfn)) if re.search(MAIN_TEX_PATT, cntnt) is not None: main_tex_path = tfn except: continue # give up if main_tex_path is None: log(('couldn\'t find main tex file in dump archive {}' '').format(fn)) # flatten to single tex file and save with tempfile.TemporaryDirectory() as tmp_dir_path: temp_tex_fn = os.path.join(tmp_dir_path, f'{fn}.tex') # find bbl file main_tex_fn = os.path.join(path, main_tex_path) bbl_files = glob.glob(os.path.join(path, '*.bbl')) if bbl_files: latexpand_args = ['latexpand', '--expand-bbl', os.path.split(bbl_files[0])[1], main_tex_path, '--output', temp_tex_fn] else: latexpand_args = ['latexpand', main_tex_path, '--output', temp_tex_fn] # run latexpand with open(os.path.join(out_dir, 'log_latexpand.txt'), 'a+') as err: subprocess.run(latexpand_args, stderr=err, cwd=path) # re-read and write to ensure utf-8 b/c latexpand doesn't # behave new_tex_fn = os.path.join(out_dir, f'{fn}.tex') cntnt = read_file(temp_tex_fn) if PRE_FIX_NATBIB: cntnt = NATBIB_PATT.sub(r'\\cite{\3}', cntnt) if PRE_FIX_BIBOPT: cntnt = BIBOPT_PATT.sub(r'\\bibitem', cntnt) if PRE_FILTER_MATH: cntnt = remove_math(cntnt) with open(new_tex_fn, mode='w', encoding='utf-8') as f: f.write(cntnt) def latex_to_xml(tex_file: str, out_dir: str, out_file: str, err_file: str, log_file: str): """ Convert expanded latex file to XML using tralics :param tex_file: :param out_dir: :param out_file: :param err_file: :param log_file: :return: """ with open(os.devnull, 'w') as devnull, \ open(err_file, 'a+') as err_f, \ open(log_file, 'a+') as skip_f: # run tralics tralics_args = ['tralics', '-silent', '-noxmlerror', '-utf8', '-oe8', '-entnames=false', '-nomathml', f'-output_dir={out_dir}', tex_file] try: subprocess.run(tralics_args, stdout=devnull, stderr=err_f, timeout=5) except subprocess.TimeoutExpired: skip_f.write(f'{tex_file}\n') # if no output, skip if not os.path.exists(out_file): skip_f.write(f'{tex_file}\n') if os.path.exists(out_file): return out_file