dkoshman committed on
Commit
8ab1767
1 Parent(s): ae308b4

trying to make data generator work from remote

Browse files
Files changed (2) hide show
  1. data_generator.py +35 -17
  2. data_preprocessing.py +1 -1
data_generator.py CHANGED
@@ -4,6 +4,7 @@ import os
4
  import string
5
  import subprocess
6
  import random
 
7
 
8
 
9
  class DotDict(dict):
@@ -101,28 +102,32 @@ def generate_equation(latex: DotDict, size, depth=3):
101
  return equation
102
 
103
 
104
- def generate_image(directory: str, latex_path: str, filename: str, max_length=20):
 
 
 
105
  """
106
  Generates a random tex file and corresponding image
107
  -------
108
  params:
109
  :directory: -- dir where to save files
110
- :latex_dir: -- path to latex json
111
- :filename: -- name for the generated files
112
  :max_length: -- max size of equation
 
 
 
113
  """
114
- # TODO ARGPARSE, path parse
115
- filepath = directory + filename
116
 
117
- with open(latex_path) as file:
118
- latex = json.load(file)
119
- latex = DotDict(latex)
120
 
121
  template = string.Template(latex.template)
122
  font, font_options = random.choice(latex.fonts)
123
  font_option = random.choice([''] + font_options)
124
  fontsize = random.choice(latex.fontsizes)
125
- equation = generate_equation(latex, max_length)
126
  tex = template.substitute(font=font, font_option=font_option, fontsize=fontsize, equation=equation)
127
 
128
  files_before = set(os.listdir(directory))
@@ -130,7 +135,7 @@ def generate_image(directory: str, latex_path: str, filename: str, max_length=20
130
  file.write(tex)
131
 
132
  pr1 = subprocess.run(
133
- f"pdflatex -output-directory={directory} {filepath}.tex".split(),
134
  stderr=subprocess.PIPE,
135
  )
136
 
@@ -138,23 +143,23 @@ def generate_image(directory: str, latex_path: str, filename: str, max_length=20
138
  if pr1.returncode != 0:
139
  files_to_delete = files_after - files_before
140
  if files_to_delete:
141
- subprocess.run(['rm'] + [directory + file for file in files_to_delete])
142
  print(pr1.stderr.decode(), tex)
143
  return
144
 
145
  pr2 = subprocess.run(
146
- f"gs -sDEVICE=png16m -dTextAlphaBits=4 -r200 -dSAFER -dBATCH -dNOPAUSE -o {filepath}.png {filepath}.pdf".split(),
147
  stderr=subprocess.PIPE,
148
  )
149
 
150
  files_to_delete = files_after - files_before - {filename + '.png', filename + '.tex'}
151
  if files_to_delete:
152
- subprocess.run(['rm'] + [directory + file for file in files_to_delete])
153
  assert (pr2.returncode == 0)
154
 
155
 
156
  def generate_data(
157
- filenames: iter(str),
158
  directory: str,
159
  latex_path: str,
160
  overwrite: bool = False
@@ -168,14 +173,27 @@ def generate_data(
168
  :latex_path: - full path to latex json
169
  :overwrite: - whether to overwrite existing files
170
  """
 
 
 
 
 
171
 
172
  filenames = set(filenames)
173
  if not overwrite:
174
- existing = set(file.split('.')[0] for file in os.listdir(directory) if file.endswith('.png'))
 
 
175
  filenames -= existing
176
 
 
 
 
 
177
  while filenames:
178
- with Pool() as pool:
179
- pool.starmap(generate_image, ((directory, latex_path, name) for name in filenames))
 
 
180
  existing = set(file.split('.')[0] for file in os.listdir(directory) if file.endswith('.png'))
181
  filenames -= existing
 
4
  import string
5
  import subprocess
6
  import random
7
+ from typing import Iterable
8
 
9
 
10
  class DotDict(dict):
 
102
  return equation
103
 
104
 
105
+ def generate_image(directory: str, latex: DotDict, filename: str, max_length=20, equation_depth=3,
106
+ pdflatex: str = "/external2/dkkoshman/venv/texlive/2022/bin/x86_64-linux/pdflatex",
107
+ ghostscript: str = "/external2/dkkoshman/venv/local/gs/bin/gs"
108
+ ):
109
  """
110
  Generates a random tex file and corresponding image
111
  -------
112
  params:
113
  :directory: -- dir where to save files
114
+ :latex: -- DotDict with parameters to generate tex
115
+ :filename: -- absolute filename for the generated files
116
  :max_length: -- max size of equation
117
+ :equation_depth: -- max nested level of tex scopes
118
+ :pdflatex: -- path to pdflatex
119
+ :ghostscript: -- path to ghostscript
120
  """
121
+ # TODO ARGPARSE
122
+ filepath = os.path.join(directory, filename)
123
 
124
+ equation_length = random.randint(1, max_length)
 
 
125
 
126
  template = string.Template(latex.template)
127
  font, font_options = random.choice(latex.fonts)
128
  font_option = random.choice([''] + font_options)
129
  fontsize = random.choice(latex.fontsizes)
130
+ equation = generate_equation(latex, equation_length, depth=equation_depth)
131
  tex = template.substitute(font=font, font_option=font_option, fontsize=fontsize, equation=equation)
132
 
133
  files_before = set(os.listdir(directory))
 
135
  file.write(tex)
136
 
137
  pr1 = subprocess.run(
138
+ f"{pdflatex} -output-directory={directory} {filepath}.tex".split(),
139
  stderr=subprocess.PIPE,
140
  )
141
 
 
143
  if pr1.returncode != 0:
144
  files_to_delete = files_after - files_before
145
  if files_to_delete:
146
+ subprocess.run(['rm'] + [os.path.join(directory, file) for file in files_to_delete])
147
  print(pr1.stderr.decode(), tex)
148
  return
149
 
150
  pr2 = subprocess.run(
151
+ f"{ghostscript} -sDEVICE=png16m -dTextAlphaBits=4 -r200 -dSAFER -dBATCH -dNOPAUSE -o {filepath}.png {filepath}.pdf".split(),
152
  stderr=subprocess.PIPE,
153
  )
154
 
155
  files_to_delete = files_after - files_before - {filename + '.png', filename + '.tex'}
156
  if files_to_delete:
157
+ subprocess.run(['rm'] + [os.path.join(directory, file) for file in files_to_delete])
158
  assert (pr2.returncode == 0)
159
 
160
 
161
  def generate_data(
162
+ filenames: Iterable[str],
163
  directory: str,
164
  latex_path: str,
165
  overwrite: bool = False
 
173
  :latex_path: - full path to latex json
174
  :overwrite: - whether to overwrite existing files
175
  """
176
+ subprocess.run(". /external2/dkkoshman/venv/bin/activate")
177
+ if not os.path.isabs(directory):
178
+ directory = os.path.join(os.getcwd(), directory)
179
+ if not os.path.isabs(latex_path):
180
+ latex_path = os.path.join(os.getcwd(), latex_path)
181
 
182
  filenames = set(filenames)
183
  if not overwrite:
184
+ existing = set(
185
+ filename for file in os.listdir(directory) for filename, ext in os.path.splitext(file) if ext == '.png'
186
+ )
187
  filenames -= existing
188
 
189
+ with open(latex_path) as file:
190
+ latex = json.load(file)
191
+ latex = DotDict(latex)
192
+
193
  while filenames:
194
+ for name in filenames:
195
+ generate_image(directory, latex, name)
196
+ # with Pool() as pool:
197
+ # pool.starmap(generate_image, ((directory, latex, name) for name in filenames))
198
  existing = set(file.split('.')[0] for file in os.listdir(directory) if file.endswith('.png'))
199
  filenames -= existing
data_preprocessing.py CHANGED
@@ -23,7 +23,7 @@ class TexImageDataset(Dataset):
23
  torch.multiprocessing.set_sharing_strategy('file_system')
24
  self.root_dir = root_dir
25
  self.filenames = sorted(set(
26
- os.path.splitext(filename)[0] for filename in os.listdir(root_dir) if filename.endswith('png')
27
  ))
28
  self.image_transform = image_transform
29
  self.tex_transform = tex_transform
 
23
  torch.multiprocessing.set_sharing_strategy('file_system')
24
  self.root_dir = root_dir
25
  self.filenames = sorted(set(
26
+ filename for file in os.listdir(root_dir) for filename, ext in os.path.splitext(file) if ext == '.png'
27
  ))
28
  self.image_transform = image_transform
29
  self.tex_transform = tex_transform