"""Helpers for locating, flattening and cleaning LaTeX sources and their bibliographies."""

import logging
import os
import re
import shutil
import subprocess
import sys

import bibtexparser
import networkx as nx
import numpy as np
import regex
import yaml
from charset_normalizer import from_path
from langdetect import detect

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Substitution tables: (regex pattern, replacement) pairs applied in order.
# ---------------------------------------------------------------------------

_COMMAND_ALIASES = (
    (r'\\newcommand\*', r'\\newcommand'),
    (r'\\providecommand\*', r'\\newcommand'),
    (r'\\providecommand', r'\\newcommand'),
    (r'\\renewcommand\*', r'\\renewcommand'),
    (r'\\newenvironment\*', r'\\newenvironment'),
    (r'\\renewenvironment\*', r'\\renewenvironment'),
)

_TITLE_ALIASES = (
    (r'\\icmltitlerunning\*', r'\\title'),
    (r'\\icmltitlerunning', r'\\title'),
    (r'\\inlinetitle\*', r'\\title'),
    (r'\\icmltitle\*', r'\\title'),
    (r'\\inlinetitle', r'\\title'),
    (r'\\icmltitle', r'\\title'),
    (r'\\titlerunning\*', r'\\title'),
    (r'\\titlerunning', r'\\title'),
    (r'\\toctitle', r'\\title'),
    (r'\\title\*', r'\\title'),
    (r'\\TITLE\*', r'\\title'),
    (r'\\TITLE', r'\\title'),
    (r'\\Title\*', r'\\title'),
    (r'\\Title', r'\\title'),
)

# Shared by declare_operator() and fix_citations().  Starred forms are listed
# before their unstarred counterparts so the star is consumed as well.
_CITE_ALIASES = (
    (r'\\citep\*', r'\\cite'),
    (r'\\citet\*', r'\\cite'),
    (r'\\citep', r'\\cite'),
    (r'\\citet', r'\\cite'),
    (r'\\cite\*', r'\\cite'),
    (r'\\citealt\*', r'\\cite'),
    (r'\\citealt', r'\\cite'),
    # Was `\\citealtp\*`, which can never match; natbib's command is \citealp*.
    (r'\\citealp\*', r'\\cite'),
    (r'\\citealp', r'\\cite'),
    (r'\\citeyear\*', r'\\cite'),
    (r'\\citeyear', r'\\cite'),
    (r'\\citeauthor\*', r'\\cite'),
    (r'\\citeauthor', r'\\cite'),
    (r'\\citenum\*', r'\\cite'),
    (r'\\citenum', r'\\cite'),
)

_REF_ALIASES = (
    (r'\\cref', r'\\ref'),
    (r'\\Cref', r'\\ref'),
    (r'\\factref', r'\\ref'),
    (r'\\appref', r'\\ref'),
    (r'\\thmref', r'\\ref'),
    (r'\\secref', r'\\ref'),
    (r'\\lemref', r'\\ref'),
    (r'\\corref', r'\\ref'),
    (r'\\eqref', r'\\ref'),
    (r'\\autoref', r'\\ref'),
)

_ENVIRONMENT_ALIASES = (
    (r'begin{thm}', r'begin{theorem}'),
    (r'begin{lem}', r'begin{lemma}'),
    (r'begin{cor}', r'begin{corollary}'),
    (r'begin{exm}', r'begin{example}'),
    (r'begin{defi}', r'begin{definition}'),
    (r'begin{rem}', r'begin{remark}'),
    (r'begin{prop}', r'begin{proposition}'),
    (r'end{thm}', r'end{theorem}'),
    (r'end{lem}', r'end{lemma}'),
    (r'end{cor}', r'end{corollary}'),
    (r'end{exm}', r'end{example}'),
    (r'end{defi}', r'end{definition}'),
    (r'end{rem}', r'end{remark}'),
    (r'end{prop}', r'end{proposition}'),
)

# Kept in the original order: the entries are applied sequentially, so later
# entries see the output of earlier ones.
_SECTION_ALIASES = (
    (r'subsubsection', r'section'),
    (r'subsubsection ', r'section'),
    (r'subsubsection\*', r'section'),
    (r'subsubsection\* ', r'section'),
    (r'subsection', r'section'),
    (r'subsection ', r'section'),
    (r'subsection\*', r'section'),
    (r'subsection\* ', r'section'),
    (r'section ', r'section'),
    (r'section\*', r'section'),
    (r'section\* ', r'section'),
    (r'chapter', r'section'),
    (r'chapter ', r'section'),
    (r'chapter\*', r'section'),
    (r'chapter\* ', r'section'),
    (r'mysubsubsection', r'section'),
    (r'mysubsection', r'section'),
    (r'mysection', r'section'),
)


def _apply_substitutions(s, table):
    """Apply each (pattern, replacement) pair of `table` to `s`, in order."""
    for pattern, replacement in table:
        s = re.sub(pattern, replacement, s)
    return s


def _list_files(directory, suffix):
    """Return paths of all files below `directory` whose name ends with `suffix`."""
    return [
        os.path.join(root, name)
        for root, _, files in os.walk(directory)
        for name in files
        if name.endswith(suffix)
    ]


# ---------------------------------------------------------------------------
# Basic I/O helpers
# ---------------------------------------------------------------------------

def is_venv():
    """Return True when the interpreter is running inside a virtual environment."""
    return (
        hasattr(sys, 'real_prefix')
        or (hasattr(sys, 'base_prefix') and sys.base_prefix != sys.prefix)
    )


def read_yaml_file(file_path):
    """Load a YAML file with `yaml.safe_load`.

    Returns the parsed data, or None (after printing the error) when the file
    is not valid YAML.
    """
    with open(file_path, 'r') as file:
        try:
            return yaml.safe_load(file)
        except yaml.YAMLError as e:
            print(f"Error reading YAML file: {e}")
            return None


def read_tex_file(file_path):
    """Return the content of `file_path`, decoded as UTF-8."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


def write_tex_file(file_path, s):
    """Write the string `s` to `file_path` as UTF-8, replacing any existing file."""
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(s)


def get_core(s):
    """Return the text between the first \\begin{document} and the last \\end{document}.

    A missing marker no longer corrupts the result: without \\begin{document}
    the core starts at the beginning of `s`, and without \\end{document} it
    runs to the end of `s`.
    """
    start = '\\begin{document}'
    end = '\\end{document}'
    beginning_doc = s.find(start)
    end_doc = s.rfind(end)
    core_start = 0 if beginning_doc == -1 else beginning_doc + len(start)
    core_end = len(s) if end_doc == -1 else end_doc
    return s[core_start:core_end]


# ---------------------------------------------------------------------------
# Command extraction / removal
# ---------------------------------------------------------------------------

def _extract_braced_text(text):
    """Return the content of the first balanced {...} group in `text`, or ""."""
    pattern = r"\{((?:[^{}]|(?R))*)\}"
    match = regex.search(pattern, text)
    if match:
        return match.group(1)
    return ""


def retrieve_text(text, command, keep_text=False):
    """Extract or remove '\\command{*}' from the string 'text'.

    With `keep_text=True`, returns the text inside the braces of the first
    occurrence of the command, or None when the command does not occur.

    With `keep_text=False`, returns `text` with every occurrence of the command
    removed.  An occurrence followed only by whitespace up to the next newline
    is replaced by "%" so that line breaks are not altered.  (Previously the
    stripped text was computed but never returned.)

    Regex `base_pattern` used to match balanced parentheses taken from:
    https://stackoverflow.com/questions/546433/regular-expression-to-match-balanced-parentheses/35271017#35271017
    """
    base_pattern = (
        r'\\' + command + r"(?:\[(?:.*?)\])*\{((?:[^{}]+|\{(?1)\})*)\}(?:\[(?:.*?)\])*"
    )

    if keep_text:
        first_match = regex.search(base_pattern, text)
        if first_match is None:
            return None
        return _extract_braced_text(first_match.group(0))

    substitutions = []
    for match in regex.finditer(base_pattern, text):
        start, end = match.span()
        replacement = ""
        if end < len(text):
            next_newline = text.find("\n", end)
            if next_newline != -1:
                until_newline = text[end:next_newline]
                # Only spaces (or nothing) up to the newline: leave a percent
                # sign so the newline structure is preserved.
                if not until_newline or until_newline.isspace():
                    replacement = "%"
        substitutions.append((start, end, replacement))

    # Apply from the back so earlier offsets stay valid.
    for start, end, replacement in reversed(substitutions):
        text = text[:start] + replacement + text[end:]
    return text


# ---------------------------------------------------------------------------
# Small text normalisers
# ---------------------------------------------------------------------------

def reduce_linebreaks(s):
    """Collapse runs of two or more (possibly indented) line breaks into one blank line."""
    return re.sub(r'(\n[ \t]*)+(\n[ \t]*)+', '\n\n', s)


def replace_percentage(s):
    """Drop a '%' (and spaces after it) that directly precedes a newline."""
    return re.sub(r'% *\n', '\n', s)


def reduce_spaces(s):
    """Collapse runs of spaces into a single space."""
    return re.sub(' +', ' ', s)


def delete_urls(s):
    """Remove every whitespace-delimited token that starts with 'http'."""
    return re.sub(r'http\S+', '', s)


def remove_tilde(s):
    """Remove a '~' or space placed before '.' or ',', and delete empty '{}' groups."""
    s1 = re.sub(r'[~ ]\.', '.', s)
    s2 = re.sub(r'[~ ],', ',', s1)
    return re.sub(r'{}', '', s2)


def remove_verbatim_words(s):
    """Strip the commands listed in configs/latex_commands.yaml from `s`.

    The config keys read are `verbatim_to_delete`, `two_arguments`,
    `three_arguments`, `two_arguments_elaborate`, `three_arguments_elaborate`
    and `replace_comments`.  The abstract environment is also unwrapped:
    \\begin{abstract} becomes the heading "Abstract" and \\end{abstract} is removed.
    """
    with open("configs/latex_commands.yaml", "r") as stream:
        read_config = yaml.safe_load(stream)

    for command in read_config['verbatim_to_delete']:
        s = s.replace(command, '')

    # Simple (non-nested) argument removal via regex.
    for command in read_config['two_arguments']:
        pattern = r'\\' + command + r'{[^}]*}' + r'{[^}]*}'
        s = re.sub(pattern, '', s)
    for command in read_config['three_arguments']:
        pattern = r'\\' + command + r'{[^}]*}' + r'{[^}]*}' + r'{[^}]*}'
        s = re.sub(pattern, '', s)

    # Brace-counting removal for commands whose arguments may nest.
    for command in read_config['two_arguments_elaborate']:
        s = remove_multargument(s, '\\' + command, 2)
    for command in read_config['three_arguments_elaborate']:
        s = remove_multargument(s, '\\' + command, 3)

    for command in read_config['replace_comments']:
        pattern = r'\\' + command
        s = re.sub(pattern, '%', s)

    s = re.sub(r'\\end{[\s]*abstract[\s]*}', '', s, flags=re.IGNORECASE)
    s = re.sub(
        r'\\begin{[\s]*abstract[\s]*}', 'Abstract\n\n', s, flags=re.IGNORECASE
    )
    return s


def yes_or_no(s):
    """Return 1 if `s` starts with "Yes", 0 if it starts with "No", otherwise -1."""
    if s[0:3] == "Yes":
        return 1
    if s[0:2] == "No":
        return 0
    return -1


# ---------------------------------------------------------------------------
# Main-file detection
# ---------------------------------------------------------------------------

def get_main(directory):
    """Guess the main .tex file of a LaTeX project.

    Builds a directed "includes" graph between the .tex files found below
    `directory` (matching included files by base name) and returns a root of
    the largest weakly connected component.  When several components tie for
    largest, the roots that pass `check_begin` are compared and the largest
    file on disk wins.

    Files that contain \\import, \\subfile or \\include* are rewritten on disk
    by `replace_imports` as a side effect.

    Returns None when there is no .tex file, or when no suitable root exists
    (previously the latter raised ValueError or IndexError).
    """
    latex_paths = _list_files(directory, '.tex')
    number_tex = len(latex_paths)
    if number_tex == 0:
        return None
    if number_tex == 1:
        return latex_paths[0]

    adjacency = np.zeros((number_tex, number_tex))
    keys = [os.path.basename(path) for path in latex_paths]
    reg_ex = (
        r'\\input{(.*?)}|\\include{(.*?)}|\\import{(.*?)}'
        r'|\\subfile{(.*?)}|\\include[*]{(.*?)}'
    )
    reg_ex_clean = r'\\input{(.*?)}|\\include{(.*?)}'

    for i, path in enumerate(latex_paths):
        content = read_tex_file(path)
        # Ignore matches whose argument is empty, e.g. "\input{}".
        if not any(any(groups) for groups in re.findall(reg_ex, content)):
            continue
        content = replace_imports(path, content)
        for groups in re.findall(reg_ex_clean, content):
            for included in filter(None, groups):
                base_match = os.path.basename(included)
                if not base_match.endswith('.tex'):
                    base_match = base_match + '.tex'
                if base_match not in keys:
                    continue
                adjacency[i][keys.index(base_match)] = 1

    G = nx.from_numpy_array(adjacency, create_using=nx.DiGraph)
    connected_components = list(nx.weakly_connected_components(G))
    maximum_size = max(len(component) for component in connected_components)
    biggest_connected = [
        component for component in connected_components
        if len(component) == maximum_size
    ]

    if len(biggest_connected) > 1:
        roots = [
            n for component in biggest_connected for n in component
            if not list(G.predecessors(n))
        ]
        candidates = []
        for r in roots:
            try:
                is_document = check_begin(latex_paths[r])
            except Exception:
                # Unreadable file or failed language detection: not a candidate.
                is_document = False
            if is_document:
                candidates.append(latex_paths[r])
        if not candidates:
            return None
        return max(candidates, key=os.path.getsize)

    roots = [n for n in biggest_connected[0] if not list(G.predecessors(n))]
    if not roots:
        # Every file is included by another one (cyclic includes).
        return None
    return latex_paths[roots[0]]


# ---------------------------------------------------------------------------
# arxiv_latex_cleaner wrapper
# ---------------------------------------------------------------------------

def _run_latex_cleaner(directory, config_args):
    """Run arxiv_latex_cleaner on `directory`; return its exit status (non-zero on failure)."""
    command = ['arxiv_latex_cleaner', '--keep_bib', directory, *config_args]
    try:
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through verbatim.
        return subprocess.run(command).returncode
    except OSError:
        # Executable missing / not runnable: report it as a failed run.
        return 127


def initial_clean(directory, config):
    """Clean `directory` in place with arxiv_latex_cleaner.

    `directory` is expected to end with '/'.  A backup copy is made at
    `<directory>_temp/` first.  If the first cleaner run fails, the backup is
    restored, every .tex file is re-encoded to UTF-8 (using charset_normalizer)
    and the cleaner is run once more.  The cleaner output is then taken from
    the sibling `<directory>_arXiv` directory, which replaces `directory`.

    Args:
        directory: project directory, with trailing slash.
        config: when truthy, pass `--config configs/cleaning_config.yaml`.

    Raises:
        FileNotFoundError: if the `_arXiv` output directory was not produced.
            `directory` is left untouched in that case (it used to be deleted
            before the failure surfaced).
    """
    config_args = ['--config', 'configs/cleaning_config.yaml'] if config else []
    parent = directory[:directory.rfind('/')]
    temp_dir = parent + '_temp' + '/'
    shutil.copytree(directory, temp_dir)

    if _run_latex_cleaner(directory, config_args) == 0:
        shutil.rmtree(temp_dir)
    else:
        # Restore the untouched copy, normalise encodings, and retry.
        shutil.rmtree(directory)
        os.rename(temp_dir, directory)
        for tex_path in _list_files(directory, '.tex'):
            best_guess = from_path(tex_path).best()
            if best_guess is None:
                # No plausible decoding found; keep the file as it is rather
                # than overwriting it with the text "None".
                continue
            with open(tex_path, 'w', encoding='utf-8') as f:
                f.write(str(best_guess))
        _run_latex_cleaner(directory, config_args)

    cleaned_directory = parent + '_arXiv'
    if not os.path.isdir(cleaned_directory):
        raise FileNotFoundError(
            f"arxiv_latex_cleaner produced no output directory: {cleaned_directory}"
        )
    shutil.rmtree(directory)
    os.rename(cleaned_directory, directory)


def check_begin(directory):
    """Return True if the .tex file at `directory` is English and has \\begin{document}.

    Despite its name, `directory` is the path of a file.  Language detection
    (langdetect) runs first and may raise on content it cannot classify.
    """
    content = read_tex_file(directory)
    english = detect(content) == 'en'
    has_document = bool(re.search(r'\\begin{document}', content))
    return has_document and english


# ---------------------------------------------------------------------------
# Pipeline
# ---------------------------------------------------------------------------

def post_processing(extracted_dir, file):
    """Flatten, de-macro and clean a LaTeX project; return the final .tex path.

    Steps: latexpand merge, de-macro, \\def handling, command normalisation
    (`declare_operator`), a second de-macro pass, two arxiv_latex_cleaner
    passes, then text-level cleanup.  The de-macro / def / normalisation steps
    are best-effort: a failure is logged and the pipeline continues with the
    last file that was produced.

    `extracted_dir` is emptied and receives `final_cleaned.tex`; it is
    concatenated directly with the file name, so it must end with '/'.
    """
    _dir = os.path.dirname(file) + '/'
    perl_expand(file)
    file = _dir + 'merged_latexpand.tex'

    try:
        de_macro(file)
        file = _dir + 'merged_latexpand-clean.tex'
    except Exception as exc:
        logger.warning("de-macro (first pass) failed: %s", exc)

    try:
        def_handle(file)
    except Exception as exc:
        logger.warning("def handling failed: %s", exc)

    try:
        declare_operator(file)  # has additional add-ons
    except Exception as exc:
        logger.warning("command normalisation failed: %s", exc)

    try:
        de_macro(file)
        file = _dir + os.path.splitext(os.path.basename(file))[0] + '-clean' + '.tex'
    except Exception as exc:
        logger.warning("de-macro (second pass) failed: %s", exc)

    initial_clean(_dir, config=True)
    initial_clean(_dir, config=False)

    tex_content = read_tex_file(file)
    cleaned = remove_verbatim_words(tex_content)
    cleaned = replace_percentage(cleaned)
    cleaned = reduce_linebreaks(cleaned)
    cleaned = remove_tilde(cleaned)
    cleaned = delete_urls(cleaned)
    final_tex = reduce_spaces(cleaned).strip()

    shutil.rmtree(extracted_dir)
    os.makedirs(extracted_dir)
    write_tex_file(extracted_dir + 'final_cleaned.tex', final_tex)
    initial_clean(extracted_dir, config=False)
    return extracted_dir + 'final_cleaned.tex'


def perl_expand(file):
    """Merge `file` and its inputs into `merged_latexpand.tex` next to it.

    Copies ./src/utils/latexpand beside `file` and runs it with perl from that
    directory.  The child process gets the directory via `cwd=` so the
    working directory of this process is never changed.
    """
    target_dir = os.path.dirname(file) + '/'
    shutil.copyfile('./src/utils/latexpand', os.path.join(target_dir, 'latexpand'))

    merged_path = os.path.join(target_dir, 'merged_latexpand.tex')
    with open(merged_path, 'w') as output_file:
        subprocess.run(
            ['perl', 'latexpand', os.path.basename(file)],
            cwd=target_dir,
            stdout=output_file,
            stderr=subprocess.DEVNULL,
        )


def de_macro(file):
    """Run de-macro.py on `file` from within the file's directory.

    Raises:
        Exception: if the de-macro script exits with a non-zero status.
    """
    target_dir = os.path.dirname(file) + '/'
    # The source path used to read '.src/utils/de-macro.py' (missing slash),
    # so the copy always failed and de-macro never ran.
    shutil.copyfile('./src/utils/de-macro.py', os.path.join(target_dir, 'de-macro.py'))

    try:
        subprocess.run(
            ['python3', 'de-macro.py', os.path.basename(file)],
            cwd=target_dir,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        raise Exception(f"Error de-macro: {e}") from e


def def_handle(file):
    """Run src/utils/def_handle.py on `file`, writing the result back to `file`.

    Raises:
        Exception: if the script cannot be started or exits non-zero.
    """
    command = ['python3', 'src/utils/def_handle.py', file, '--output', file]
    try:
        status = subprocess.run(command).returncode
    except OSError as exc:
        raise Exception('Error def handle') from exc
    if status != 0:
        raise Exception('Error def handle')


def _clean_title_text(title):
    """Replace LaTeX line breaks (\\\\), newlines and '~' in `title` with spaces."""
    return title.replace('\\\\', ' ').replace('\n', ' ').replace('~', ' ')


def declare_operator(file):
    """Normalise command, title, citation, reference and sectioning macros in `file`.

    The file is rewritten in place and reduced to the body of the document
    environment (see `get_core`).  When a \\title{...} appears before
    \\begin{document}, every \\maketitle is replaced by the cleaned title text.
    """
    s = read_tex_file(file)

    ## Operators
    s = re.sub(r'\\DeclareMathOperator', r'\\newcommand', s)
    s = re.sub(r'\\end +', r'\\end', s)
    s = _apply_substitutions(s, _COMMAND_ALIASES)

    ## Title
    s = _apply_substitutions(s, _TITLE_ALIASES)
    # Both offsets are taken after the substitutions above, so they refer to
    # the same string and can be compared.
    find_potential = s.find('\\title')
    beginning_doc = s.find('\\begin{document}')

    title_content = retrieve_text(s, 'title', keep_text=True)
    if title_content is not None:
        cleaned_title = _clean_title_text(title_content)
        s = s.replace(title_content, cleaned_title)
        # Only inside this branch: without an extracted title (e.g. the only
        # hit for "\title" was "\titlepage") there is nothing to substitute.
        if find_potential != -1 and find_potential < beginning_doc:
            s = s.replace('\\maketitle', cleaned_title)

    ## Cite and ref commands, theorem-like environments
    s = _apply_substitutions(
        s, _CITE_ALIASES + _REF_ALIASES + _ENVIRONMENT_ALIASES
    )

    ## Sectioning commands
    s = _apply_substitutions(s, _SECTION_ALIASES)

    # In case any new commands for appendix/appendices
    s = re.sub(r'newcommand{\\appendix}', '', s)
    s = re.sub(r'newcommand{\\appendices}', '', s)

    s = get_core(s)

    ## In case of double titles being defined
    title_content = retrieve_text(s, 'title', keep_text=True)
    if title_content is not None:
        s = s.replace(title_content, _clean_title_text(title_content))

    write_tex_file(file, s)


def replace_imports(file, s):
    """Rewrite \\import, \\subfile and \\include* as \\input; save to `file` and return the text."""
    s = re.sub(r'\\import{(.*?)}{(.*?)}', r"\\input{\1\2}", s)
    s = re.sub(r'\\subfile{(.*?)}', r"\\input{\1}", s)
    s = re.sub(r'\\include[*]{(.*?)}', r"\\input{\1}", s)
    write_tex_file(file, s)
    return s


def _end_of_arguments(s, start, k):
    """Return the index just past the k-th balanced brace group after `start`, or None."""
    opened = 0
    closed = 0
    completed = 0
    for position in range(start, len(s)):
        char = s[position]
        if char == '{':
            opened += 1
        elif char == '}':
            closed += 1
        if opened and closed and opened == closed:
            completed += 1
            opened = 0
            closed = 0
            if completed == k:
                return position + 1
    return None


def remove_multargument(s, target, k):
    """Remove every occurrence of `target` together with its next `k` brace groups.

    Brace groups are found by counting '{' and '}', so nested braces inside an
    argument are handled.  An occurrence that is not followed by `k` complete
    groups is left in place (previously the rest of the string was deleted, or
    an unbound index was used when `target` ended the string).
    """
    search_from = 0
    ind = s.find(target, search_from)
    while ind != -1:
        start_ind = ind + len(target)
        end_ind = _end_of_arguments(s, start_ind, k)
        if end_ind is None:
            # Malformed occurrence: skip it and keep looking further on.
            search_from = start_ind
        else:
            s = s[:ind] + s[end_ind:]
        ind = s.find(target, search_from)
    return s


def fix_citations(s):
    """Rewrite natbib-style citation commands (\\citep, \\citet, ...) to plain \\cite."""
    return _apply_substitutions(s, _CITE_ALIASES)


# ---------------------------------------------------------------------------
# Bibliography handling
# ---------------------------------------------------------------------------

def find_bib(directory):
    """Return the paths of all .bib files below `directory`."""
    return _list_files(directory, '.bib')


def find_bbl(directory):
    """Return the paths of all .bbl files below `directory`."""
    return _list_files(directory, '.bbl')


def _has_fields(block):
    """Return True for parsed blocks that are usable entries carrying fields."""
    skipped = (
        bibtexparser.model.DuplicateBlockKeyBlock,
        bibtexparser.model.ParsingFailedBlock,
        bibtexparser.model.ImplicitComment,
    )
    # Blocks such as @string / @preamble / explicit comments have no `fields`.
    return not isinstance(block, skipped) and hasattr(block, 'fields')


def _normalize_title(raw, drop_quotes=False):
    """Strip emphasis macros, commas and braces from `raw`, trim dots/space, lowercase."""
    cleaned = raw
    for token in ("\\emph", "\\emp", "\\em", ",", "{", "}"):
        cleaned = cleaned.replace(token, "")
    if drop_quotes:
        cleaned = cleaned.replace("``", "").replace("\'\'", "")
    return cleaned.strip(".").strip().strip(".").lower()


def _quoted_title(text):
    """Return the normalised text between the first `` and the first '', or None."""
    opening = text.find("``")
    closing = text.find("\'\'")
    if opening == -1 or closing == -1:
        return None
    return _normalize_title(text[opening + 2:closing], drop_quotes=True)


def _title_from_note(note):
    """Extract a title from a bibliography note, or return None.

    The title is taken from a ``...'' quotation; failing that, from the text
    between the two \\newblock markers when there are exactly two.
    """
    text = re.sub(" +", " ", note.replace("\n", " "))
    title = _quoted_title(text)
    if title is None and text.count("\\newblock") == 2:
        text = text.replace("\\newblock", "``", 1)
        text = text.replace("\\newblock", "\'\'", 1)
        title = _quoted_title(text)
    return title


def create_bib_from_bbl(bibfile):
    """Parse a .bib file generated from a .bbl into `{entry key: fields}`.

    Each entry's `note` field is mined for a title (see `_title_from_note`);
    when one is found it is stored under 'title'.  Entries without a `note`
    are kept without a title instead of raising KeyError.
    """
    with open(bibfile, 'r') as f:
        content = f.read()
    library_raw = bibtexparser.parse_string(content)

    library = {}
    for block in library_raw.blocks:
        if not _has_fields(block):
            continue
        fields = {field.key: field.value for field in block.fields}

        ## Get a good title one ##
        note = fields.get("note")
        if note is not None:
            title = _title_from_note(note)
            if title is not None:
                fields['title'] = title
        library[block.key] = fields
    return library


def create_bib(bibfile):
    """Parse a .bib file into `{entry key: fields}`, keeping only entries with a title.

    Braces are removed from every field value; the title is additionally
    flattened to one line, stripped of emphasis macros and commas, and
    lowercased.
    """
    with open(bibfile, 'r') as f:
        content = f.read()
    library_raw = bibtexparser.parse_string(content)

    library = {}
    for block in library_raw.blocks:
        if not _has_fields(block):
            continue
        fields = {}
        for field in block.fields:
            if field.key == 'title':
                title = re.sub(r'[\n]+', ' ', field.value)  # keep only one \n
                title = re.sub(r' +', ' ', title)
                fields[field.key] = _normalize_title(title)
            else:
                fields[field.key] = field.value.replace('{', '').replace('}', '')
        if 'title' not in fields:
            continue
        library[block.key] = fields
    return library


def textobib(file):
    """Convert the .bbl `file` to a .bib beside it with the tex2bib perl script.

    Copies ./tex2bib next to `file`, runs it from that directory and returns
    the path of the generated .bib file.  The exit status of the script is not
    checked.
    """
    target_dir = os.path.dirname(file) + '/'
    shutil.copyfile('./tex2bib', target_dir + 'tex2bib')

    base_name = os.path.basename(file)
    output_file = os.path.splitext(base_name)[0] + '.bib'
    subprocess.run(
        ['perl', 'tex2bib', '-i', base_name, '-o', output_file],
        cwd=target_dir,
    )
    return target_dir + output_file


def get_library_bib(bib_files):
    """Merge the entries of several .bib files; later files override earlier keys."""
    final_library = {}
    for bib_file in bib_files:
        final_library.update(create_bib(bib_file))
    return final_library


def get_library_bbl(bbl_files):
    """Convert each .bbl file to .bib and merge the entries; later files override earlier keys."""
    # Convert everything first, then parse, matching the original ordering of
    # side effects.
    bib_files = [textobib(bbl_file) for bbl_file in bbl_files]
    final_library = {}
    for bib_file in bib_files:
        final_library.update(create_bib_from_bbl(bib_file))
    return final_library