Spaces:
Runtime error
Runtime error
File size: 6,231 Bytes
02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 6e82d4a 02f3832 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
import json
import os
import random
import string
import subprocess
from multiprocessing import Pool
from typing import Iterable
class DotDict(dict):
    """
    dot.notation access to dictionary attributes
    -------
    Keys can be read, assigned and deleted as attributes. Reading a missing
    attribute returns None (attribute reads are routed to `dict.get`), while
    deleting a missing attribute raises KeyError (routed to `dict.__delitem__`).
    Every top-level value that is a dict is wrapped into a DotDict, recursively,
    whichever way the mapping was constructed (dict, iterable of pairs or kwargs).
    """
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Snapshot the items so values can be replaced while looping.
        for key, value in list(self.items()):
            if isinstance(value, dict):
                # DotDict(value) recurses into deeper levels on its own.
                self[key] = DotDict(value)
def _generate_equation(size_left, depth_left, latex, tokens):
if size_left <= 0:
return ""
equation = ""
pairs, scopes, special = latex.pairs, latex.scopes, latex.special
weights = [3, depth_left > 0, depth_left > 0]
group, = random.choices([tokens, pairs, scopes], weights=weights)
if group is tokens:
equation += ' '.join([
random.choice(tokens),
_generate_equation(size_left - 1, depth_left, latex, tokens)
])
return equation
post_scope_size = round(abs(random.gauss(0, size_left / 2)))
size_left -= post_scope_size + 1
if group is pairs:
pair = random.choice(pairs)
equation += ' '.join([
pair[0],
_generate_equation(size_left, depth_left - 1, latex, tokens),
pair[1],
_generate_equation(post_scope_size, depth_left, latex, tokens)
])
return equation
elif group is scopes:
scope_type, scope_group = random.choice(list(scopes.items()))
scope_operator = random.choice(scope_group)
equation += scope_operator
if scope_type == 'single':
equation += ' '.join([
special.left_bracket,
_generate_equation(size_left, depth_left - 1, latex, tokens)
])
elif scope_type == 'double_no_delimiters':
equation += ' '.join([
special.left_bracket,
_generate_equation(size_left // 2, depth_left - 1, latex, tokens),
special.right_bracket + special.left_bracket,
_generate_equation(size_left // 2, depth_left - 1, latex, tokens)
])
elif scope_type == 'double_with_delimiters':
equation += ' '.join([
special.caret,
special.left_bracket,
_generate_equation(size_left // 2, depth_left - 1, latex, tokens),
special.right_bracket,
special.underscore,
special.left_bracket,
_generate_equation(size_left // 2, depth_left - 1, latex, tokens)
])
equation += ' '.join([
special.right_bracket,
_generate_equation(post_scope_size, depth_left, latex, tokens)
])
return equation
def generate_equation(latex: DotDict, size, depth=3):
    """
    Builds one random latex equation string
    -------
    params:
    :latex: -- mapping holding the token groups ('chars', 'greek', 'functions',
               'operators', 'spaces') plus whatever the recursive generator reads
    :size: -- size budget handed to the recursive generator
    :depth: -- nesting budget handed to the recursive generator
    """
    # Flatten all plain-token groups into a single pool, in this fixed order.
    tokens = []
    for group_name in ('chars', 'greek', 'functions', 'operators', 'spaces'):
        tokens.extend(latex[group_name])
    return _generate_equation(size, depth, latex, tokens)
def generate_image(directory: str, latex_path: str, filename: str, max_length=20):
    """
    Generates a random tex file and corresponding image
    -------
    params:
    :directory: -- dir where to save files
    :latex_path: -- path to latex json
    :filename: -- name for the generated files, without extension
    :max_length: -- size passed to the equation generator
    -------
    returns None. When pdflatex exits with a non-zero code, the files it created
    for this filename are removed, its stderr and the tex source are printed,
    and no image is produced.
    raises AssertionError when ghostscript exits with a non-zero code.
    """
    # TODO ARGPARSE, path parse
    filepath = os.path.join(directory, filename)

    with open(latex_path, encoding='utf-8') as file:
        latex = DotDict(json.load(file))

    template = string.Template(latex.template)
    font, font_options = random.choice(latex.fonts)
    font_option = random.choice([''] + font_options)
    fontsize = random.choice(latex.fontsizes)
    equation = generate_equation(latex, max_length)
    tex = template.substitute(font=font, font_option=font_option, fontsize=fontsize, equation=equation)

    files_before = set(os.listdir(directory))
    with open(f"{filepath}.tex", mode='w') as file:
        file.write(tex)

    # Argument lists (not split strings) keep paths with whitespace intact;
    # stdin is detached so pdflatex cannot block on an interactive error prompt.
    pr1 = subprocess.run(
        ["pdflatex", f"-output-directory={directory}", f"{filepath}.tex"],
        stdin=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
    )
    files_after = set(os.listdir(directory))

    # Several workers may write into the same directory at once, so restrict
    # cleanup to new files that carry this call's own name; otherwise one
    # worker would delete the intermediate or final files of another.
    own_new_files = {
        name for name in files_after - files_before
        if os.path.splitext(name)[0] == filename
    }

    if pr1.returncode != 0:
        _remove_files(directory, own_new_files)
        print(pr1.stderr.decode(), tex)
        return

    pr2 = subprocess.run(
        [
            "gs", "-sDEVICE=png16m", "-dTextAlphaBits=4", "-r200",
            "-dSAFER", "-dBATCH", "-dNOPAUSE",
            "-o", f"{filepath}.png", f"{filepath}.pdf",
        ],
        stderr=subprocess.PIPE,
    )
    # Keep the tex source and the rendered image, drop pdflatex by-products.
    _remove_files(directory, own_new_files - {filename + '.png', filename + '.tex'})

    # Raised explicitly so the check survives `python -O`.
    if pr2.returncode != 0:
        raise AssertionError(
            f"gs exited with code {pr2.returncode}: {pr2.stderr.decode(errors='replace')}"
        )


def _remove_files(directory, names):
    """Removes the named files from directory, ignoring files that cannot be removed."""
    for name in names:
        try:
            os.remove(os.path.join(directory, name))
        except OSError:
            # Cleanup is best-effort: a leftover file must not abort generation.
            pass
def generate_dataset(
        filenames: Iterable[str],
        directory: str = "/external2/dkkoshman/repos/ML2TransformerApp/data/",
        latex_path: str = "/external2/dkkoshman/repos/ML2TransformerApp/resources/latex.json",
        overwrite: bool = False
) -> None:
    """
    Generates a latex dataset in given directory
    -------
    params:
    :filenames: - iterable of filenames to create, without extension
    :directory: - where to create
    :latex_path: - full path to latex json
    :overwrite: - whether to overwrite existing files
    -------
    Generation is repeated for the names that still have no png in the
    directory, until every requested name has one.
    """
    filenames = set(filenames)
    if not overwrite:
        filenames -= _rendered_names(directory)
    if not filenames:
        # Nothing to do, so do not spin up worker processes.
        return

    # One pool serves every retry round instead of a fresh pool per round.
    with Pool() as pool:
        while filenames:
            pool.starmap(generate_image, [(directory, latex_path, name) for name in filenames])
            filenames -= _rendered_names(directory)


def _rendered_names(directory):
    """Returns the set of names (extension stripped) that have a png in directory."""
    # splitext drops only the final extension, so names containing dots
    # are matched correctly against the requested filenames.
    return {
        os.path.splitext(file)[0]
        for file in os.listdir(directory)
        if file.endswith('.png')
    }
|