Gabor Cselle committed on
Commit
41d52be
1 Parent(s): d0f419a

Font allowlist, deal with .ttc files, let's just generate 10 per font

Browse files
Files changed (2) hide show
  1. .gitignore +2 -1
  2. gen_sample_data.py +36 -22
.gitignore CHANGED
@@ -1 +1,2 @@
1
- font_images
 
 
1
+ font_images
2
+ .DS_Store
gen_sample_data.py CHANGED
@@ -1,7 +1,6 @@
1
  # Generate sample data with 800x400 images of fonts in /System/Library/Fonts
2
  # 50 images per font, 1 font per image
3
 
4
-
5
  import os
6
  from PIL import Image, ImageDraw, ImageFont
7
  import nltk
@@ -14,12 +13,16 @@ nltk.download('brown')
14
  # Sample text for prose and code
15
  prose_text = " ".join(brown.words(categories='news')[:50]) # First 50 words from news category
16
 
17
- font_dir = '/System/Library/Fonts/'
 
18
  output_dir = './font_images'
19
  os.makedirs(output_dir, exist_ok=True)
20
 
21
  all_brown_words = sorted(set(brown.words(categories='news')))
22
 
 
 
 
23
  def wrap_text(text, line_length=10):
24
  """
25
  Wraps the provided text every 'line_length' words.
@@ -35,27 +38,38 @@ def random_code_text(base_code, num_lines=15): # Increase number of lines
35
  lines = base_code.split("\n")
36
  return "\n".join(random.sample(lines, min(num_lines, len(lines))))
37
 
38
- for font_file in os.listdir(font_dir):
39
- if font_file.endswith('.ttf'):
40
- font_path = os.path.join(font_dir, font_file)
41
- font_name = font_file.split('.')[0]
42
- print(font_name)
43
-
44
- j = 0
45
- for i in range(50): # Generate 50 images per font
46
- prose_sample = random_prose_text(all_brown_words)
47
 
48
- for text in [prose_sample]:
49
- img = Image.new('RGB', (800, 400), color="white") # Canvas size
50
- draw = ImageDraw.Draw(img)
 
 
51
  font_size = random.choice(range(32, 128)) # Increased minimum font size
52
  font = ImageFont.truetype(font_path, font_size)
53
 
54
- # Random offsets, but ensuring that text isn't too far off the canvas
55
- offset_x = random.randint(-20, 10)
56
- offset_y = random.randint(-20, 10)
57
- draw.text((offset_x, offset_y), text, fill="black", font=font)
58
-
59
- j += 1
60
- output_file = os.path.join(output_dir, f"{font_name}_{j}.png")
61
- img.save(output_file)
 
 
 
 
 
 
 
 
 
 
1
  # Generate sample data with 800x400 images of fonts in /System/Library/Fonts
2
  # 50 images per font, 1 font per image
3
 
 
4
  import os
5
  from PIL import Image, ImageDraw, ImageFont
6
  import nltk
 
13
  # Sample text for prose and code
14
  prose_text = " ".join(brown.words(categories='news')[:50]) # First 50 words from news category
15
 
16
+ # Note that this will only work on macOS, where these are the default font directories
17
+ font_dirs = ['/System/Library/Fonts/', '/System/Library/Fonts/Supplemental/']
18
  output_dir = './font_images'
19
  os.makedirs(output_dir, exist_ok=True)
20
 
21
  all_brown_words = sorted(set(brown.words(categories='news')))
22
 
23
+ # This is a list of fonts that we want to use for our sample data
24
+ FONT_ALLOWLIST = ["Arial", "Avenir", "Courier", "Helvetica", "Georgia", "Tahoma", "Times New Roman", "Verdana"]
25
+
26
  def wrap_text(text, line_length=10):
27
  """
28
  Wraps the provided text every 'line_length' words.
 
38
  lines = base_code.split("\n")
39
  return "\n".join(random.sample(lines, min(num_lines, len(lines))))
40
 
41
+ for font_dir in font_dirs:
42
+ for font_file in os.listdir(font_dir):
43
+ if font_file.endswith('.ttf') or font_file.endswith('.ttc'):
44
+ font_path = os.path.join(font_dir, font_file)
45
+ font_name = font_file.split('.')[0]
46
+ if font_name not in FONT_ALLOWLIST:
47
+ continue
48
+ # Output the font name so we can see the progress
49
+ print(font_path, font_name)
50
 
51
+ if font_file.endswith('.ttc'):
52
+ # ttc fonts have multiple fonts in one file, so we need to specify which one we want
53
+ font = ImageFont.truetype(font_path, random.choice(range(32, 128)), index=0)
54
+ else:
55
+ # ttf fonts have only one font in the file
56
  font_size = random.choice(range(32, 128)) # Increased minimum font size
57
  font = ImageFont.truetype(font_path, font_size)
58
 
59
+ # Counter for the image filename
60
+ j = 0
61
+ for i in range(10): # Generate 50 images per font - reduced to 10 for now to make things faster
62
+ prose_sample = random_prose_text(all_brown_words)
63
+
64
+ for text in [prose_sample]:
65
+ img = Image.new('RGB', (800, 400), color="white") # Canvas size
66
+ draw = ImageDraw.Draw(img)
67
+
68
+ # Random offsets, but ensuring that text isn't too far off the canvas
69
+ offset_x = random.randint(-20, 10)
70
+ offset_y = random.randint(-20, 10)
71
+ draw.text((offset_x, offset_y), text, fill="black", font=font)
72
+
73
+ j += 1
74
+ output_file = os.path.join(output_dir, f"{font_name}_{j}.png")
75
+ img.save(output_file)