volrath50 committed on
Commit
fef304e
1 Parent(s): 4eeaedb

Upload create-mtg-training-data.py

Browse files
Files changed (1) hide show
  1. create-mtg-training-data.py +388 -0
create-mtg-training-data.py ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# A program to build a Stable Diffusion training data set for Magic: The Gathering.
# It reads every card from a saved JSON file, parses it, and saves an image file
# and a corresponding text file for each card.

import json
import os
import random
import time

import requests


# Directory that receives the downloaded images and their caption files.
directory = './mtg-training-data/'
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# which avoids the check-then-create race of testing os.path.exists() first.
os.makedirs(directory, exist_ok=True)

# Location of the saved card JSON file that the script reads.
json_filename = './mtg-training-data/mtgdata.json'

# Characters that are unsafe or unwanted in a file name, mapped to their
# replacement (None deletes the character).
_FILENAME_TRANSLATION = str.maketrans({
    '/': '-',
    ':': '-',
    '\'': '-',
    '–': '-',
    '—': '-',
    ' ': '_',
    '?': None,
    '!': None,
    '\"': None,
    '(': None,
    ')': None,
    ',': None,
    '’': None,
    '…': None,
})


def _sanitize_filename(filename):
    """Return filename with the characters above replaced or removed."""
    return filename.translate(_FILENAME_TRANSLATION)


#save a normal, single-faced card
def save_card_main(card):
    """Save the art-crop image and the caption text file for one card.

    Reads the 'art_crop' URL from card['image_uris'], builds the file name
    with which_filename(), sanitises it, downloads the image only when no
    file of that name exists yet, and writes the caption produced by
    return_data() next to it via save_data().

    Args:
        card: dict describing one card; must contain 'image_uris' plus the
            keys that which_filename() and return_data() read.
    """
    img_url = card['image_uris'].get('art_crop')
    filename = _sanitize_filename(which_filename(card))

    # Skip the download when the image is already on disk so the script can
    # be re-run cheaply. If the image server starts rejecting requests, a
    # short time.sleep() before save_image() throttles the downloads.
    if not os.path.isfile('{}{}'.format(directory, filename)):
        save_image(img_url, filename)

    # NOTE(review): the original nesting of the next step was lost; the
    # caption is assumed to be (re)written on every run, even when the image
    # already exists - confirm.
    save_data(return_data(card), filename)


#determine the filename we're saving the image to; format is cardname_setcode_illustration_id.jpg
def which_filename(card):
    """Return the (unsanitised) image file name for a card.

    The name is '<card name>_<set code>_<id>.jpg'. The id is the card's
    'illustration_id' when it has one, otherwise the first entry of
    'multiverse_ids', otherwise the card's own 'id'.

    Args:
        card: dict with at least 'name' and 'set', and one of
            'illustration_id', a non-empty 'multiverse_ids', or 'id'.

    Returns:
        The file name as a str, ending in '.jpg'.

    Raises:
        KeyError: if 'name' or 'set' is missing, or no usable id exists.
    """
    # The file name is always built from card['name']; a flavor name, if
    # present, only affects the caption text written by return_data().
    art_id = card.get('illustration_id')
    if not art_id:
        # Fall back to the first multiverse id; if that list is absent or
        # empty, use the card's own 'id' instead of failing on [0].
        multiverse_ids = card.get('multiverse_ids') or []
        art_id = multiverse_ids[0] if multiverse_ids else card['id']
    return card['name'] + '_' + card['set'] + '_' + str(art_id) + '.jpg'


#save the image
def save_image(img_url, filename, timeout=30):
    """Download img_url and write the response body to directory + filename.

    A failed request (connection error, timeout, invalid URL or HTTP error
    status) is reported on stdout and nothing is written, so an error page
    is never stored under an image file name and a later run can retry.

    Args:
        img_url: URL of the image to download.
        filename: file name, relative to the module-level `directory`.
        timeout: seconds to wait for the server before giving up.
    """
    try:
        response = requests.get(img_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as error:
        print('Could not download {}: {}'.format(img_url, error))
        return

    with open('{}{}'.format(directory, filename), 'wb') as f:
        f.write(response.content)
    #report to stdout that we just saved the url to the filename
    print('Saved {} to {}'.format(img_url, filename))


#save the data as a text file with the same name as the image
def save_data(data, filename):
    """Write str(data) to a UTF-8 text file named after the image file.

    Args:
        data: caption to store; converted with str() before writing.
        filename: the image's file name, relative to the module-level
            `directory`; a trailing '.jpg' is swapped for '.txt'.
    """
    # Only the trailing extension is swapped, so a '.jpg' that happens to
    # appear in the middle of the name is left untouched.
    if filename.endswith('.jpg'):
        stem = filename[:-len('.jpg')]
    else:
        stem = filename
    text_filename = stem + '.txt'

    with open('{}{}'.format(directory, text_filename), 'w', encoding="utf8") as f:
        f.write(str(data))
    #report to stdout what we just saved and to what filename
    print('Saved {} to {}'.format(data, text_filename))


#get the data we want from the card and return it
#from the json, we're getting card name, artist, the year of release, color, type, rarity, set name, set code, mana cost, any watermarks, power/toughness, any keywords, universes beyond info
def return_data(card):
    """Build the caption text for one card.

    The individual fields are collected into a tuple whose string form is
    then passed through clean_data(), which strips the tuple punctuation and
    the *_PLACEHOLDER tokens used here for missing optional fields.

    Args:
        card: dict for one card. Required keys: 'name', 'artist',
            'released_at', 'type_line', 'rarity', 'set_name', 'set',
            'set_type'. Optional keys: 'flavor_name', 'colors', 'watermark',
            'mana_cost', 'security_stamp', 'power', 'toughness', 'keywords',
            'promo_types', 'story_spotlight'.

    Returns:
        The cleaned caption as a str.

    Raises:
        KeyError: if a required key is missing.
    """
    def field_or(key, placeholder):
        """Return card[key] when present and truthy, else the placeholder."""
        value = card.get(key)
        return value if value else placeholder

    glue = 'MTG card art'

    # A flavor name, when present, replaces the card name in the caption.
    # Straight apostrophes are swapped for ’ so that clean_data()'s removal
    # of single quotes does not delete them.
    # NOTE(review): clean_data() only restores '`' back to an apostrophe, so
    # the ’ written here stays in the caption - confirm that is intended.
    name = field_or('flavor_name', card['name']).replace('\'', '’')
    artist = ('by ' + card['artist']).replace('\'', '’')
    year = get_year(card['released_at'])

    if card.get('colors'):
        colors_english = mtg_color_to_english(card['colors'])
        colors = clean_colors(card['colors'])
    else:
        colors_english = 'Colorless'
        colors = 'C'

    card_type = card['type_line']
    rarity = card['rarity']
    set_name = card['set_name']
    set_code = card['set']
    set_type = card['set_type']
    plane = determine_plane(set_code)

    watermark = field_or('watermark', 'WATERMARK_PLACEHOLDER')
    # A missing mana cost is treated like an empty one.
    mana_cost = clean_data(card.get('mana_cost') or '')

    # parse_security_stamp() returns None for stamps that carry no caption
    # text; use the placeholder then, otherwise the literal text "None"
    # would end up in the caption.
    security_stamp = None
    if card.get('security_stamp'):
        security_stamp = parse_security_stamp(card['security_stamp'])
    if not security_stamp:
        security_stamp = 'SECURITY_STAMP_PLACEHOLDER'

    power = field_or('power', 'POWER_PLACEHOLDER')
    toughness = field_or('toughness', 'TOUGHNESS_PLACEHOLDER')
    power_toughness = power + '/' + toughness

    keywords = field_or('keywords', 'KEYWORDS_PLACEHOLDER')
    promo_types = field_or('promo_types', 'PROMO_TYPES_PLACEHOLDER')
    story_spotlight = field_or('story_spotlight', 'SPOTLIGHT_PLACEHOLDER')

    #at one point I put extra spaces in as a quick fix to make the txt files come out right - I have no idea if they are still needed
    extra_space = ' '

    #put it all together
    data = (
        glue, name, artist, year, colors_english, colors, card_type, rarity,
        set_name, set_code, plane, set_type, watermark, extra_space,
        mana_cost, extra_space, security_stamp, power_toughness, keywords,
        promo_types, story_spotlight,
    )
    #then clean it up
    return clean_data(data)


# Plane label -> set codes (upper case) that take place there.
# NOTE(review): only 'NEC' maps to 'Kamigawa, present'; 'NEO' is not listed -
# confirm whether that is intended.
_SET_CODES_BY_PLANE = {
    'Alara': ('ALA', 'CON', 'ARB'),
    'Amonkhet': ('AKH', 'HOU'),
    'Arcavios': ('STX',),
    'Capenna': ('SNC',),
    'Eldraine': ('ELD',),
    'Fiora': ('CNS', 'CN2'),
    'Ikoria': ('IKO',),
    'Innistrad': ('ISD', 'DKA', 'AVR', 'SOI', 'EMN', 'MID', 'VOW'),
    'Ixalan': ('IXL', 'RIX'),
    'Kaladesh': ('KLD', 'AER'),
    'Kaldheim': ('KHM',),
    'Kamigawa, past': ('SOK', 'BOK', 'CHK'),
    'Kamigawa, present': ('NEC',),
    # This entry used to be a disabled branch that crashed the script: it
    # compared (plane == 'Kylem') instead of assigning, so `plane` was never
    # bound before the return. As a table entry it is safe.
    'Kylem': ('BBD',),
    'Lorwyn, LorShad': ('LOR', 'MOR'),
    'Shadowmoor, LorShad': ('SHM', 'EVE'),
    'Mercadia': ('MMQ',),
    'New Phyrexia': ('NPH',),
    'Mirrodin': ('MRD', 'DST', '5DN', 'SOM', 'MBS'),
    'Rabiah': ('ARN',),
    'Rath': ('TMP', 'STH', 'EXO', 'NEM'),
    'Ravnica': ('RAV', 'GPT', 'DIS', 'RTR', 'GTC', 'DGM', 'GRN', 'RNA', 'WAR'),
    'Shandalar': ('M10', 'M11', 'M12', 'M13', 'M14', 'M15'),
    'Tarkir': ('KTK', 'FRF', 'DTK'),
    'Theros': ('THS', 'BNG', 'JOU', 'THB'),
    'Ulgrotha': ('HML',),
    'Zendikar': ('ZEN', 'WWK', 'ROE', 'BFZ', 'OGW', 'ZNR'),
    #Dominaria
    'Dominaria': ('LEA', 'LEG', 'DOM', 'DMU'),
    'Dominaria, Terisiare': ('ATQ', 'DRK'),
    'Dominaria, Terisiare, Ice Age': ('ICE', 'ALL', 'CSP'),
    'Dominaria, past': ('BRO',),
    'Dominaria, Jamuraa': ('MIR', 'VIS', 'WTH', 'PCY'),
    'Dominaria, Time Spiral': ('TSP', 'PLC', 'FUT'),
    'Dominaria, Phyrexian Invasion': ('INV', 'PLS', 'APC'),
    'Dominaria, Otaria': ('ODY', 'TOR', 'JUD', 'ONS', 'LGN', 'SCG'),
}

# Inverted once at import time so each lookup is a single dict access.
_PLANE_BY_SET_CODE = {
    code: plane
    for plane, codes in _SET_CODES_BY_PLANE.items()
    for code in codes
}


#determine the plane from the set code
def determine_plane(set_code):
    """Return the plane label for a set code, or ' ' when it is not listed.

    Args:
        set_code: set code as a str; matched case-insensitively.

    Returns:
        The plane label from the table above, or a single space for a set
        code that has no entry.
    """
    return _PLANE_BY_SET_CODE.get(set_code.upper(), ' ')


#clean the colors up, remove the square brackets, quotes, commas and spaces
def clean_colors(colors):
    """Return the colour letters of `colors` as one string in WUBRG order.

    Args:
        colors: a list of colour letters such as ['U', 'W'] (or anything
            whose str() form is those letters plus list punctuation).

    Returns:
        The letters concatenated and ordered W, U, B, R, G, e.g. 'WU'.
        Characters other than those five are kept, after the colour letters,
        in their original relative order.
    """
    wubrg = 'WUBRG'
    # Drop the punctuation that str() adds around a list of strings.
    letters = str(colors).translate(str.maketrans('', '', '[]\', '))
    # sorted() is stable, so equal-ranked characters keep their order.
    return ''.join(sorted(
        letters,
        key=lambda c: wubrg.index(c) if c in wubrg else len(wubrg),
    ))


# Security stamp value -> text to put in the caption. Stamps that are not
# listed ('circle', 'oval' and anything unknown) contribute no caption text;
# 'circle' was once meant to map to 'Signature Spellbook' but is disabled.
_SECURITY_STAMP_TEXT = {
    'acorn': 'acorn',
    'triangle': 'Universes Beyond',
    'arena': 'Arena',
    'heart': 'My Little Pony',
}


#figure out what the security stamp is telling us
def parse_security_stamp(security_stamp):
    """Translate a security stamp value into caption text.

    Args:
        security_stamp: the stamp value as a str; may be empty or None.

    Returns:
        The caption text for 'acorn', 'triangle', 'arena' and 'heart';
        None for every other value, including empty/None input.
    """
    if not security_stamp or not isinstance(security_stamp, str):
        return None
    return _SECURITY_STAMP_TEXT.get(security_stamp)


# Colour letter -> English colour name.
_COLOR_NAMES = {
    'W': 'White',
    'U': 'Blue',
    'B': 'Black',
    'R': 'Red',
    'G': 'Green',
}


#remove all characters other than WUBRG, then convert letter by letter
def mtg_color_to_english(colors):
    """Return the English colour names for `colors`, joined with ', '.

    Args:
        colors: an iterable of colour letters, e.g. ['W', 'U'] or 'WU'.
            Entries that are not part of 'WUBRG' are ignored.

    Returns:
        The names in input order, e.g. 'White, Blue'. An input without any
        recognised letter yields '' - this function never produces
        'Colorless'; return_data() supplies that label itself when a card
        has no colours.
    """
    letters = ''.join(c for c in colors if c in 'WUBRG')
    return ', '.join(_COLOR_NAMES[letter] for letter in letters)


#get the year from the release date
def get_year(date):
    """Return the first four characters of `date`.

    Args:
        date: release date as a str; assumed to start with a four-digit
            year (e.g. '2019-10-04') - TODO confirm against the JSON data.

    Returns:
        The leading four characters, e.g. '2019'.
    """
    return date[:4]


# Per-face keys that every face must provide.
_FACE_REQUIRED_KEYS = (
    'illustration_id', 'name', 'mana_cost', 'type_line', 'oracle_text',
    'colors', 'artist', 'image_uris',
)
# Per-face keys that are copied only when the face has a truthy value.
_FACE_OPTIONAL_KEYS = ('flavor_name', 'power', 'toughness')


#special function for double-faced cards - we are going to save the front and back separately
#scryfall treats them as a single card, but we want to treat them as two separate cards, so we need to split them up
def split_dfc(card):
    """Save each of the first two faces of a double-faced card separately.

    For every face a copy of `card` is made, the face's own fields are laid
    over the copy, and the result is handed to save_card_main().

    Working on a fresh copy per face (rather than overwriting `card`) keeps
    the caller's dict untouched and stops optional values of the front face
    (power, toughness, flavor name) from leaking onto a back face that does
    not define them.

    Args:
        card: dict with a 'card_faces' list; each face must provide the keys
            in _FACE_REQUIRED_KEYS.

    Raises:
        KeyError: if 'card_faces' or a required face key is missing.
    """
    # Only the first two faces are saved; slicing also tolerates a card
    # that has fewer than two.
    for face in card['card_faces'][:2]:
        # A shallow copy is enough: only top-level keys are reassigned.
        face_card = dict(card)
        for key in _FACE_REQUIRED_KEYS:
            face_card[key] = face[key]
        for key in _FACE_OPTIONAL_KEYS:
            if face.get(key):
                face_card[key] = face[key]
        save_card_main(face_card)


# Curly brackets are dropped and the long dash becomes a hyphen.
_BRACES_AND_DASH = str.maketrans({'{': None, '}': None, '—': '-'})

# Tokens that return_data() inserts for missing optional fields.
_PLACEHOLDERS = (
    'WATERMARK_PLACEHOLDER',
    'SECURITY_STAMP_PLACEHOLDER',
    'POWER_PLACEHOLDER',
    'TOUGHNESS_PLACEHOLDER',
    'KEYWORDS_PLACEHOLDER',
    'PROMO_TYPES_PLACEHOLDER',
    'SPOTLIGHT_PLACEHOLDER',
)

# Punctuation left over from the str() form of tuples and lists.
_BRACKETS_AND_QUOTES = str.maketrans('', '', '()[]\'"')

# Accented letters and their plain replacements.
_ACCENT_REPLACEMENTS = (
    ('ǵ', 'g'),
    ('õ', 'o'),
    ('é', 'e'),
    ('ï', 'i'),
    ('ñ', 'n'),
    ('ë', 'e'),
    ('û', 'u'),
    ('ó', 'o'),
    ('ń', 'n'),
)


#clean up the data we're saving so the AI can use it
#remove curly brackets, convert long dash to hyphen
def clean_data(data):
    """Return str(data) with markup, placeholders and stray punctuation removed.

    The steps run in a fixed order because later ones clean up what earlier
    ones leave behind (for example the empty slots that remain once a
    placeholder has been removed).

    Args:
        data: any value; it is converted with str() first, so a tuple or
            list of fields is flattened into one comma-separated string.

    Returns:
        The cleaned text as a str.
    """
    text = str(data).translate(_BRACES_AND_DASH)

    #remove placeholder text
    for placeholder in _PLACEHOLDERS:
        text = text.replace(placeholder, '')

    #remove double commas
    text = text.replace(', ,', ',')
    #remove ( and ), square brackets, single and double quotes
    text = text.translate(_BRACKETS_AND_QUOTES)
    #remove double commas
    text = text.replace(',,', '')
    #remove /,
    text = text.replace('/,', '')
    #remove " ,"
    text = text.replace(' ,', ' ')
    #remove double spaces (a single pass: each pair of spaces becomes one)
    text = text.replace('  ', ' ')

    for accented, plain in _ACCENT_REPLACEMENTS:
        text = text.replace(accented, plain)

    #put ' back in for the placeholder ` - this must stay after the removal
    #of single quotes above, or the restored apostrophe would be deleted
    return text.replace('`', '\'')


# Layouts that are handed to split_dfc() so each face is saved on its own.
DOUBLE_FACED_LAYOUTS = (
    'transform',
    'modal_dfc',
    'reversible_card',
    'double_faced_token',
)

#open the JSON file and load the data
with open(json_filename, 'r', encoding="utf8") as f:
    card_data = json.load(f)

#loop through each card in the data, one by one, and save the image and data
for card in card_data:
    layout = card['layout']
    #if the layout is art_series, skip it, we don't want to save those
    if layout == 'art_series':
        continue
    #double-faced cards have a special function that saves each face separately
    if layout in DOUBLE_FACED_LAYOUTS:
        split_dfc(card)
    else:
        save_card_main(card)