Nick Canu committed on
Commit
b0829c1
1 Parent(s): 394d881
.gitattributes CHANGED
@@ -1,34 +1,3 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.gzip filter=lfs diff=lfs merge=lfs -text
  *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+ Model_Constants.py
.streamlit/config.toml ADDED
@@ -0,0 +1,6 @@
+ [theme]
+ primaryColor="#e76020"
+ backgroundColor="#FDFFFC"
+ secondaryBackgroundColor="#6E896A"
+ textColor="#0f0f0d"
+ font="monospace"
.vscode/launch.json ADDED
@@ -0,0 +1,16 @@
+ {
+   // Use IntelliSense to learn about possible attributes.
+   // Hover to view descriptions of existing attributes.
+   // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+   "version": "0.2.0",
+   "configurations": [
+     {
+       "name": "Python: Module",
+       "type": "python",
+       "request": "launch",
+       "module": "streamlit",
+       "args": ["run", "Home.py"],
+       "justMyCode": true
+     }
+   ]
+ }
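This launch configuration starts the app through Streamlit's module entry point; it is equivalent to running `python -m streamlit run Home.py` (or simply `streamlit run Home.py`) from the repository root.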
Home.py ADDED
@@ -0,0 +1,348 @@
+ import streamlit as st
+
+ st.set_page_config(page_title='Auto-BG: The Game Concept Generator', layout='wide')
+
+ def application():
+     ###Imports
+     import pandas as pd
+     import numpy as np
+     import re
+     import urllib.request
+     import pickle
+     import spacy
+     from spacy.tokens import DocBin
+     from title_generator import Title_Generator
+     import gzip
+     import io
+     from description_generator import input_manager, model_control
+
+     #UI Session Variables
+     if 'desc_iter' not in st.session_state:
+         st.session_state.desc_iter = 0
+     if 'title_iter' not in st.session_state:
+         st.session_state.title_iter = 0
+     if 'output_dict' not in st.session_state:
+         st.session_state.output_dict = {}
+     if 'inputs' not in st.session_state:
+         st.session_state.inputs = []
+     if 'cur_pair' not in st.session_state:
+         st.session_state.cur_pair = ("","Run me!")
+     if 'f_d' not in st.session_state:
+         st.session_state.f_d = None
+     if 'g_d' not in st.session_state:
+         st.session_state.g_d = None
+     if 'm_d' not in st.session_state:
+         st.session_state.m_d = None
+     if 'c_d' not in st.session_state:
+         st.session_state.c_d = None
+     if 'coop_d' not in st.session_state:
+         st.session_state.coop_d = 0
+
+     #non-ui helper functions
+     #reader code extended from https://gist.github.com/thearn/5424244 for alternate load format
+     def reader(url):
+         url_file = io.BytesIO(urllib.request.urlopen(url).read())
+         f = gzip.GzipFile(fileobj=url_file)
+         data = f.read()
+         obj = pickle.loads(data)
+         f.close()
+         return obj
+
+     def token_expand(url):
+         nlp = spacy.blank("en")
+         url_file = urllib.request.urlopen(url)
+         f = gzip.GzipFile(fileobj=url_file)
+         data = f.read()
+         obj = pickle.loads(data)
+         f.close()
+         doc_bin = DocBin().from_bytes(obj)
+         docs = list(doc_bin.get_docs(nlp.vocab))
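+         #assumption from downstream usage: these slices appear to partition the DocBin
+         #into game type, mechanic, category, and family token groups for the similarity search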
+         return (docs[1:9],docs[9:192],docs[192:276],docs[276:3901])
+
+     def revert_cats(gt, mec, cat, fam, coop):
+         gt = ["game_type_" + x for x in gt]
+         mec = ["mechanic_" + x for x in mec]
+         cat = ["category_" + x for x in cat]
+         fam = ["family_" + x for x in fam if x != "Game: [redacted]"]
+         if coop == 1:
+             co = ["cooperative", "mechanic_Cooperative Game"]
+         else:
+             co = []
+
+         final_list = [gt,mec,cat,fam, co]
+         return [item for sublist in final_list for item in sublist]
+
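+     #builder runs the full pipeline: parse the tag vector, format the prompt,
+     #call the fine-tuned model three times, clean each response, then generate and score titles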
+     def builder(ip):
+         ks = iman.input_parser(iman.set_input(ip))
+         mctrl.prompt_formatter(ks)
+         descs = []
+         for status in np.arange(0,3):
+             desc = mctrl.call_api(status=status)
+             clean_desc = mctrl.resp_cleanup(desc)
+             inter_pair = Tgen.candidate_generator(clean_desc)
+             out = Tgen.candidate_score(inter_pair,ex_check)
+             descs.append(out)
+             st.sidebar.success("Prompt " +str(status+1)+ " generated!")
+         st.session_state.output_dict = {0:descs[0],1:descs[1],2:descs[2]}
+
+
+
+     def title_check(next=0):
+         if next==1:
+             if st.session_state.title_iter == (len(st.session_state.output_dict[st.session_state.desc_iter]['titles'])-1):
+                 st.session_state.title_iter = 0
+             else:
+                 st.session_state.title_iter +=1
+         elif next==-1:
+             if st.session_state.title_iter == 0:
+                 st.session_state.title_iter = (len(st.session_state.output_dict[st.session_state.desc_iter]['titles'])-1)
+             else:
+                 st.session_state.title_iter -=1
+         else:
+             st.session_state.title_iter = 0
+
+         cur_title = st.session_state.output_dict[st.session_state.desc_iter]['titles'][st.session_state.title_iter][0]
+         desc = re.sub(re.compile("__"),cur_title,st.session_state.output_dict[st.session_state.desc_iter]['text'])
+
+         return (cur_title, desc.lstrip())
+
+     def show_title(val):
+         out = title_check(next=val)
+         st.session_state.cur_pair = out
+
+     def PT_button_clicked():
+         show_title(-1)
+
+     def NT_button_clicked():
+         show_title(1)
+
+     def PD_button_clicked():
+         if st.session_state.desc_iter == 0:
+             st.session_state.desc_iter = 2
+             st.session_state.title_iter = 0
+         else:
+             st.session_state.desc_iter -= 1
+             st.session_state.title_iter = 0
+         show_title(0)
+
+     def ND_button_clicked():
+         if st.session_state.desc_iter == 2:
+             st.session_state.desc_iter = 0
+             st.session_state.title_iter = 0
+         else:
+             st.session_state.desc_iter += 1
+             st.session_state.title_iter = 0
+         show_title(0)
+
+
+
+     ###Variables
+
+     ###Data
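+     #st.cache_resource keeps the loaded data in memory across Streamlit reruns,
+     #so the remote files are only downloaded once per running instance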
+     @st.cache_resource
+     def fetch_data():
+         slim_df = pd.read_parquet('https://github.com/canunj/Auto-BoardGame/blob/main/Model_Step_Data/slim_df.parquet.gzip?raw=true')
+         search_tokens = token_expand("https://github.com/canunj/Auto-BoardGame/blob/main/Persistent%20Objects/token_search.gz?raw=true")
+         vector_df = pd.read_parquet('https://github.com/canunj/Auto-BoardGame/blob/main/Model_Step_Data/vector_df.parquet.gzip?raw=true')
+         category_keys = reader("https://github.com/canunj/Auto-BoardGame/blob/main/Persistent%20Objects/current_keys.gz?raw=true")
+         coop = [1,0]
+         st.sidebar.success("Fetched Data!")
+         return slim_df, search_tokens, vector_df, category_keys, coop
+
+     slim_df, search_tokens, vector_df, category_keys, coop = fetch_data()
+
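+     #regex blocklist of real game titles the generators must not reproduce in output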
+     ex_check = ["[Ee]verquest","[Cc]ivilization [Ii][IiVv]","[Cc]ivilization(?=:)","[Cc]ivilization [Ii][Ii]",
+                 "[Cc]ivilization [Ii][Ii][Ii]","[Cc]ivilization V","[Aa]ge [Oo]f [Ee]mpires [Ii][Ii2]([Ii]|\b)", "[Rr]avenloft|[Cc]astle [Rr]avenloft",
+                 "[Ss]cythe(?=:|\b)","[Dd]ungeons [&Aa][ n][Dd ][ Ddr][Ddra][rg][oa][gn][os](ns|\b)",
+                 "[Aa]ge [Oo]f [Ee]mpires [Ii][Ii]: [Tt]he [Aa]ge [Oo]f [Kk]ings","[Aa]ge [Oo]f [Ee]mpires 2: [Tt]he [Aa]ge [Oo]f [Kk]ings",
+                 "[Aa]ge [Oo]f [Ee]mpires","Doctor Who"]
+
+     ###Models
+     @st.cache_resource
+     def setup_models():
+         return Title_Generator('./t5_model', slim_df), input_manager(vector_df, slim_df, search_tokens), model_control(apikey=st.secrets.key,model_id=st.secrets.model)
+
+     Tgen, iman, mctrl = setup_models()
+
+
+
+     #UI
+
+     #Intro
+     st.title("""Auto-BG: The Game Concept Generator""")
+
+     with st.expander("How to use", expanded=True):
+         st.write(
+ """
+ Discover the concept for your next favorite game!
+
+ How do you use Auto-BG?
+
+ Pick any set of tags from the four selectors below: Family, Game, Mechanic, and Category.
+ If you are looking to lose together, activate the cooperative toggle.
+
+ See the ? icons for detailed information on each type of tag.
+
+ Select any pre-configured demo below to see how Auto-BG works on the tag set for a popular board game.
+ """
+         )
+
+     results = st.empty()
+
+     with st.expander('Demos'):
+
+         st.write("""These buttons run Auto-BG on the tag sets of real games you might be familiar with.
+ Choose a button and the corresponding tags automatically fill the selectors below.
+ Press run and see how Auto-BG creates an alternate concept for these hit titles!
+ """)
+
+         b1, b2, b3 = st.columns(3)
+
+         with b1:
+             SoC = st.button('Catan', use_container_width=True)
+             if SoC:
+                 st.session_state.f_d = [
+                     'Animals: Sheep',
+                     'Components: Hexagonal Tiles',
+                     'Components: Wooden pieces & boards'
+                 ]
+                 st.session_state.g_d = ['Family Game', 'Strategy Game']
+                 st.session_state.m_d = [
+                     'Hexagon Grid',
+                     'Network and Route Building',
+                     'Random Production',
+                     'Trading',
+                     'Variable Set-up'
+                 ]
+                 st.session_state.c_d = [
+                     'Economic',
+                     'Negotiation'
+                 ]
+                 st.session_state.coop_d = 0
+
+         with b2:
+             TtR = st.button('Ticket to Ride', use_container_width=True)
+             if TtR:
+                 st.session_state.f_d = [
+                     'Components: Map (Continental / National scale)',
+                     'Continents: North America',
+                     'Country: USA'
+                 ]
+                 st.session_state.g_d = ['Family Game']
+                 st.session_state.m_d = [
+                     'Contracts',
+                     'End Game Bonuses',
+                     'Network and Route Building',
+                     'Push Your Luck',
+                     'Set Collection'
+                 ]
+                 st.session_state.c_d = [
+                     'Trains'
+                 ]
+                 st.session_state.coop_d = 0
+
+         with b3:
+             P = st.button('Pandemic', use_container_width=True)
+             if P:
+                 st.session_state.f_d = [
+                     'Components: Map (Global Scale)',
+                     'Components: Multi-Use Cards',
+                     'Medical: Diseases',
+                     'Region: The World',
+                     'Theme: Science'
+                 ]
+                 st.session_state.g_d = ['Family Game', 'Strategy Game']
+                 st.session_state.m_d = [
+                     'Action Points',
+                     'Point to Point Movement',
+                     'Trading',
+                     'Variable Player Powers'
+                 ]
+                 st.session_state.c_d = [
+                     'Medical'
+                 ]
+                 st.session_state.coop_d = 1
+
+     #Form
+     with st.expander("Auto-BG", expanded=True):
+
+         col1, col2 = st.columns(2)
+
+         with col1:
+             Family_v = st.multiselect("Family", options=pd.Series(category_keys[4][8:]), key='Family', default=st.session_state.f_d, max_selections=6, help='Descriptive niches for groupings of games.\n Maximum of six choices.')
+
+         with col2:
+             Game_v = st.multiselect("Game", options=pd.Series(category_keys[1]), key='Game', default=st.session_state.g_d, max_selections=2, help='Top level genres - Family, Strategy, etc.\n Maximum of two choices.')
+
+         col3, col4 = st.columns(2)
+
+         with col3:
+             Category_v = st.multiselect("Category", options=pd.Series(category_keys[3]), key='Category', default=st.session_state.c_d, max_selections=3, help='Expanded genre tags.\n Maximum of three choices.')
+
+         with col4:
+             Mechanics_v = st.multiselect("Mechanics", options=pd.Series([x for x in category_keys[2] if x != "Cooperative Game"]), key='Mechanic', default=st.session_state.m_d, max_selections=5, help='Game rules!\n Maximum of five choices.')
+
+         Cooperative_v = st.checkbox('Cooperative?', value=st.session_state.coop_d, key='CoopCheck')
+
+         run = st.button("Run Model", use_container_width=True)
+
+         if run:
+             if st.session_state.inputs == revert_cats(Game_v, Mechanics_v, Category_v, Family_v, Cooperative_v):
+                 st.write('Inputs did not change, results currently loaded.')
+             else:
+
+                 st.session_state.desc_iter = 0
+                 st.session_state.title_iter = 0
+                 st.session_state.output_dict = {}
+
+                 if Cooperative_v == True:
+                     Mechanics_v.append('Cooperative Game')
+
+                 st.session_state.inputs = revert_cats(Game_v, Mechanics_v, Category_v, Family_v, Cooperative_v)
+                 builder(st.session_state.inputs)
+                 st.session_state.cur_pair = title_check()
+
+     if st.session_state.output_dict == {}:
+         results.empty()
+     else:
+         with results.expander('Results', expanded=True):
+
+             st.write(
+ """
+ #### Title:
+ """)
+
+
+
+             st.write(st.session_state.cur_pair[0])
+
+
+             t_col1, t_col2 = st.columns(2)
+             with t_col1:
+                 st.button("See Previous Title", on_click=PT_button_clicked, use_container_width=True)
+
+             with t_col2:
+                 st.button("See Next Title", on_click=NT_button_clicked, use_container_width=True)
+
+             st.write(
+ """
+ #### Description:
+ """)
+             st.write(st.session_state.cur_pair[1].replace('$','\$'))
+
+             d_col1, d_col2 = st.columns(2)
+             with d_col1:
+                 st.button("See Previous Description", on_click=PD_button_clicked, use_container_width=True)
+
+             with d_col2:
+                 st.button("See Next Description", on_click=ND_button_clicked, use_container_width=True)
+
+
+
+ page_names_to_funcs = {
+     "Application": application
+ }
+
+ demo_name = st.sidebar.selectbox("Choose a page:", page_names_to_funcs.keys())
+ page_names_to_funcs[demo_name]()
+
Model_Constants_Template.py ADDED
@@ -0,0 +1,7 @@
+ def SEND_KEY():
+     KEY = ""
+     return KEY
+
+ def SEND_MODEL():
+     OAI_MODEL = ""
+     return OAI_MODEL
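A minimal local-wiring sketch (an assumption for illustration, not part of this commit; the Space build in Home.py reads st.secrets instead):

    # hypothetical consumer of a filled-in Model_Constants.py (the real file is gitignored)
    import Model_Constants as mc
    from description_generator import model_control

    mctrl = model_control(apikey=mc.SEND_KEY(), model_id=mc.SEND_MODEL())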
Model_Step_Data/slim_df.parquet.gzip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8eb032341c8bacc24ffee96e2a1b3201a0ab6c2837567956ba1ddb9492e056dc
+ size 16243764
Model_Step_Data/vector_df.parquet.gzip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eaf463f341982a460862da6ee77bbed38ad92ad36c4aef10bc031828681ef83f
+ size 3803902
Persistent Objects/current_keys.gz ADDED
Binary file (39.7 kB)
Persistent Objects/token_search.gz ADDED
Binary file (144 kB)
README.md CHANGED
@@ -1,13 +1,48 @@
- ---
- title: Auto BoardGame
- emoji: 🦀
- colorFrom: pink
- colorTo: pink
- sdk: streamlit
- sdk_version: 1.17.0
- app_file: app.py
- pinned: false
- license: cc-by-nc-sa-2.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ [icon banner image placeholder]
+
+ # Auto-BG
+ LLM-based text generation tool for creating board game concepts (description & title)
+
+ Auto-BG (Board Game) is a text generation tool for creating board game concepts. It uses multiple large language models to generate board game titles and descriptions tailored to user-input tags based on BoardGameGeek.com. The models used in this project are a trained T5 sequence-to-sequence model, primarily for title generation, and a fine-tuned GPT-3 model for board game description generation. The T5 model was introduced by Raffel et al. in ["Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer"](https://arxiv.org/pdf/1910.10683.pdf). The GPT-3 model builds on Brown et al.'s work in ["Language Models are Few-Shot Learners"](https://arxiv.org/pdf/2005.14165.pdf).
+
+
+ ## Table of Contents
+ - Features and Demo
+ - Examples
+ - Project Structure
+ - Customizing Auto-BG
+ - Citations and Licensing
+
+ ## Features and Demo
+ The main features of this application include:
+
+ A user-friendly interface for Auto-BG can be found at (homepage).
+
+ ## Examples
+
+ ## Project Structure
+
+ ## Customizing Auto-BG
+ NOTE: Auto-BG uses a fine-tuned GPT-3 Curie model that will be inaccessible without an organizational API key;
+ the instructions below are for advanced users interested in remixing Auto-BG with a new generator model.
+
+ In order to run this application, you will need the following:
+ 1. An OpenAI account and API key
+ 2. All libraries specified in both the primary and data processing requirements.txt files
+ 3. A raw stream JSON file of BoardGameGeek data, formatted to match output from the Recommend.Games scraper
+
+ To implement a new instance of Auto-BG, follow these steps:
+ 1. Clone the repository onto your local machine
+ 2. Install the required packages listed in both 'requirements.txt' files using pip
+ 3. Download the trained T5 model or provide a path to an alternate T5 model.
+ 4. Place the JSON data file in Stream_to_Output, then run GameCleaner.py - this produces all required data files.
+ 5. Prepare training prompts - convert all active keys to period-stopped tokens in a single string for each game (see the sketch after this list).
+ 6. Fine-tune a selected model following the instructions at: https://platform.openai.com/docs/guides/fine-tuning
+ NOTE: Auto-BG uses a Curie model with a lowered learning rate running for fewer epochs.
+ 7. Create a Model_Constants.py file with your personal API key and model instance based on the template above.
+ 8. You now have a customized instance of Auto-BG!
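+
+ For step 5, a minimal sketch of the expected prompt shape (an illustration assuming the same separator used by prompt_formatter in description_generator.py):
+
+ ```python
+ # hypothetical example: the active one-hot keys for a single game
+ keys = ["game_type_Strategy Game", "mechanic_Trading", "category_Economic", "cooperative"]
+ prompt = ". ".join(keys) + "\n\n###\n\n"  # period-stopped tokens plus a fixed separator
+ ```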
+
+ ## Citations and Licensing
+ Auto-BG is licensed under CC BY-NC-SA 2.0; original data sourced from Recommend.Games @GitLab.
Stream_to_Output/GameCleaner.py ADDED
@@ -0,0 +1,144 @@
+ import pandas as pd
+ import numpy as np
+ import re
+ import nltk
+ from nltk.corpus import stopwords
+ from gensim.parsing import preprocess_string, strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
+ import spacy
+ from langdetect import detect
+ import pickle
+ import gzip
+ nltk.download('stopwords')
+
+ #function definitions
+
+ #strips values out of encoded stream lists
+ def text_col_cleaner(frame, cols, pattern):
+
+     pattern = re.compile(pattern)
+
+     for col in cols:
+         frame[col] = frame[col].map(lambda x: [re.findall(pattern,val)[0].strip() for val in x], na_action='ignore')
+     return frame
+
+ #converts specified columns to one-hot
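+ #e.g. a row with mechanic=['Trading','Set Collection'] gains one-hot columns
+ #mechanic_Trading and mechanic_Set Collection set to 1 alongside the original column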
+ def encode_columns(frame):
+     targets = list(frame.columns)
+     for t in targets:
+         one_hot = pd.get_dummies(frame[t].apply(pd.Series).stack(),prefix=t).groupby(level=0).sum()
+         frame = pd.concat([frame,one_hot],axis=1)
+     return frame
+
+ #custom text processor for tokenizing descriptions by Kuan Chen & Nick Canu
+ def doc_text_preprocessing(ser):
+     nlp=spacy.load("en_core_web_sm", exclude=['parser','ner','textcat'])
+
+     """text processing steps"""
+     stop_words=set(stopwords.words('english'))
+     stop_words.update(['game','player','players','games', 'also',
+                        'description','publisher'])
+
+     single_letter_replace=lambda c: re.sub("\s+\w{1}\s+|\n|-|—",'',c)
+     to_lower_func=lambda c: c.lower()
+
+     lemma_text=[preprocess_string(
+         ' '.join([token.lemma_ for token in desc]
+         ),[remove_stopwords,strip_numeric,strip_punctuation,strip_tags,
+            strip_multiple_whitespaces,single_letter_replace,to_lower_func]
+         ) for desc in ser.apply(lambda x: nlp(x))]
+
+     tokenize_text=[[word for word in string if word not in stop_words] for string in lemma_text]
+
+     return tokenize_text
+
+ #performs English language detection on the descriptions w/langdetect, then additionally drops games using non-English characters in the name
+ def lang_cleanup(frame):
+     nlp=spacy.load("en_core_web_sm")
+     frame['description']=frame['description'].fillna('no words')
+     frame = frame[frame['description']!='no words']
+     frame['cleaned_descriptions']=doc_text_preprocessing(frame['description'])
+
+     detected_lang = []
+     for word in frame.cleaned_descriptions:
+         word=', '.join(word)
+         detected_lang.append(detect(word))
+     frame['lang'] = detected_lang
+     frame = frame[frame['lang']=='en']
+
+     non_eng_title_filter = frame['name'].str.contains('[^\x00-\x7f]', flags=re.IGNORECASE)
+     return frame[~non_eng_title_filter]
+
+
+ #column name stripper for creating key values
+ def column_fixer(frame,targ):
+     return [col.replace(targ, "").strip('"') for col in frame.columns if col.startswith(targ)]
+
+ #creates key list for defining web app lists & nlp tokens of the same unknown input search
+ def key_collator(frame):
+     nlp=spacy.load("en_core_web_sm")
+     fam = column_fixer(frame,'family_')
+     gt = column_fixer(frame,'game_type_')
+     mec = column_fixer(frame,'mechanic_')
+     cat = column_fixer(frame,'category_')
+
+     current_keys = (['cooperative'],gt,mec,cat,fam)
+
+     fam_keys = [nlp(w) for w in fam]
+     gt_keys = [nlp(w) for w in gt]
+     mec_keys = [nlp(w) for w in mec]
+     cat_keys = [nlp(w) for w in cat]
+
+     search_tokens = (gt_keys,mec_keys,cat_keys,fam_keys)
+
+     return current_keys, search_tokens
+
+
+ #-----------
+
+ #reading in raw file & removing unranked and compilation game items
+ df = pd.read_json(r'./bgg_GameItem.jl', lines=True)
+ df['rank'] = df['rank'].fillna(0).astype(int)
+ df = df[(df['rank']>0) & (df['compilation']!=1)]
+
+ #separating and cleaning the one-hot target columns
+ in_df = text_col_cleaner(frame = df[['game_type','mechanic','category','family']],
+                          cols = ['game_type','mechanic','category','family'],
+                          pattern = re.compile("([\S ]+)(?=:)"))
+
+ print('Text has been cleaned, now encoding one-hot columns')
+
+ #encoding one-hot columns and rejoining to features for output
+ proc_df = encode_columns(in_df)
+ step = df[['name','description','cooperative']]
+ join_df = pd.concat([step,proc_df.drop(['game_type','mechanic','category','family',
+                                         'game_type_Amiga','game_type_Arcade','game_type_Atari ST',
+                                         'game_type_Commodore 64'],axis=1)],axis=1)
+
+ print('Columns encoded, now performing English language detection and cleanup')
+
+ #English language detection steps & first data save
+ eng_df = lang_cleanup(join_df)
+ eng_df = eng_df.loc[:,~eng_df.columns.duplicated()].copy().reset_index(drop=True).fillna(0)
+
+ print('Creating vector-only dataframe & saving output')
+
+ #vector only data for operations
+ vector_df = eng_df.copy().drop(['name','description','cleaned_descriptions','lang'],axis=1)
+
+ eng_df.to_parquet('game_data.parquet.gzip',compression='gzip')
+ vector_df.to_parquet('game_vectors.parquet.gzip',compression='gzip')
+
+ print('Creating key lists')
+
+ #creating key lists - 1. string list of values by feature class for defining input selections & 2. nlp processed list for unknown input search
+ keys, search_toks = key_collator(vector_df)
+
+ with gzip.open("current_keys.gz", "wb") as f:
+     pickle.dump(keys, f)
+
+ with gzip.open("key_search_tokens.gz", "wb") as f:
+     pickle.dump(search_toks, f)
+
+ print('File creation is complete')
Stream_to_Output/requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gensim==4.3.1
+ langdetect==1.0.9
+ nltk==3.8.1
+ numpy==1.24.2
+ pandas==1.3.2
+ spacy==3.5.1
__pycache__/Model_Constants.cpython-39.pyc ADDED
Binary file (457 Bytes)
__pycache__/description_generator.cpython-39.pyc ADDED
Binary file (4.62 kB)
__pycache__/title_generator.cpython-39.pyc ADDED
Binary file (6.8 kB)
description_generator.py ADDED
@@ -0,0 +1,120 @@
+
+ import numpy as np
+ import re
+ import spacy
+ import openai
+ from operator import itemgetter
+
+ #user input manager class
+ class input_manager:
+
+     #initialize key dictionary from vector data frame and set community top N
+     def __init__(self,key_df, slim_df, search_tokens, top_n=10):
+         self.key_df = key_df
+         self.slim_df = slim_df
+         self.search_tokens = search_tokens
+         self.key = dict(zip(list(key_df.columns),np.zeros(len(key_df.columns))))
+         self.top_n = top_n
+
+     #translate input text to vector
+     def set_input(self,input_cats):
+         #need setup to apply correct group tag to values
+         nlp=spacy.load("en_core_web_md")
+         #separate known/unknown features
+         k_flags = [cat for cat in input_cats if cat in list(self.key.keys())]
+         unk_flags = [cat for cat in input_cats if cat not in list(self.key.keys())]
+
+         #process within feature class similarity for each unknown input
+         if len(unk_flags)>0:
+
+             outs = []
+             for word in unk_flags:
+                 if re.match(r"game_type_",word):
+                     tok = nlp(word.split("_")[-1])
+                     mtch = max([(key,key.similarity(tok)) for key in self.search_tokens[0]],key=itemgetter(1))
+                     #if no known match is found (the model doesn't recognize the input word), discard it - other solutions are performance prohibitive
+                     if mtch[1]>0:
+                         outs.append("game_type_"+str(mtch[0]))
+                 elif re.match(r"mechanic_",word):
+                     tok = nlp(word.split("_")[-1])
+                     mtch = max([(key,key.similarity(tok)) for key in self.search_tokens[1]],key=itemgetter(1))
+                     if mtch[1]>0:
+                         outs.append("mechanic_"+str(mtch[0]))
+                 elif re.match(r"category_",word):
+                     tok = nlp(word.split("_")[-1])
+                     mtch=max([(key,key.similarity(tok)) for key in self.search_tokens[2]],key=itemgetter(1))
+                     if mtch[1]>0:
+                         outs.append("category_"+str(mtch[0]))
+                 elif re.match(r"family_",word):
+                     tok = nlp(word.split("_")[-1])
+                     mtch=max([(key,key.similarity(tok)) for key in self.search_tokens[3]],key=itemgetter(1))
+                     if mtch[1]>0:
+                         outs.append("family_"+str(mtch[0]))
+
+             #if unks are processed, rejoin the nearest matches to the known flags.
+             k_flags = list(set(k_flags+outs))
+
+         #preserve the global key and output a copy w/input keys activated to 1
+         d = self.key.copy()
+         for cat in k_flags:
+             d[cat] = 1.0
+
+         return d
+
+     def input_parser(self,in_vec):
+         #extracting keys from processed vector
+         ks = [k for k,v in in_vec.items() if v == 1]
+
+         return ks
+
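+ #hypothetical usage sketch (not part of the commit): unknown tags fall back to the
+ #nearest known tag by spaCy vector similarity before being one-hot encoded, e.g.
+ #  iman = input_manager(vector_df, slim_df, search_tokens)
+ #  ks = iman.input_parser(iman.set_input(["mechanic_Trading", "category_Pirates"]))
+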
+ class model_control:
+     def __init__(self, apikey, model_id):
+         self.api_key = apikey
+         openai.api_key = self.api_key
+
+         self.prompt = None
+
+         self.model = openai.FineTune.retrieve(id=model_id).fine_tuned_model
+
+     def prompt_formatter(self,ks):
+         self.prompt = ". ".join(ks) + "\n\n###\n\n"
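+         #the "\n\n###\n\n" suffix mirrors the fixed-separator convention OpenAI's
+         #legacy fine-tuning guide recommends appending to every training prompt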
+
+
+
+     def call_api(self,status=0):
+         if status == 0:
+             temp=0.5
+             pres=0.7
+         elif status == 1:
+             temp=0.4
+             pres=0.6
+         elif status == 2:
+             temp=0.5
+             pres=0.8
+
+         answer = openai.Completion.create(
+             model=self.model,
+             prompt=self.prompt,
+             max_tokens=512,
+             temperature=temp,
+             stop=["END"],
+             presence_penalty=pres,
+             frequency_penalty=0.5
+         )
+         return answer['choices'][0]['text']
+
+     def resp_cleanup(self,text):
+
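+         #drop a trailing unfinished sentence, then remove any sentence crediting
+         #designers, artists, or publishers before returning the cleaned text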
+         if ((text[-1] != "!") & (text[-1] != ".") & (text[-1] != "?")):
+             text = " ".join([e+'.' for e in text.split('.')[0:-1] if e])
+
+         sent = re.split(r'([.?!:])', text)
+         phrases = ["[Dd]esigned by","[Dd]esigner of","[Aa]rt by","[Aa]rtist of","[Pp]ublished","[Pp]ublisher of"]
+
+         pat = re.compile("(?:" + "|".join(phrases) + ")")
+         fix = re.compile("(?<=[.!?])[.!?]")
+
+         text = re.sub(fix,'',''.join([s for s in sent if pat.search(s) is None]))
+
+
+         return text
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ gensim==4.3.1
+ langdetect==1.0.9
+ nltk==3.8.1
+ numpy==1.24.2
+ openai==0.27.2
+ pandas==1.3.2
+ scikit_learn==1.2.2
+ spacy==3.5.1
+ streamlit==1.20.0
+ torch==2.0.0
+ transformers==4.27.3
t5_model/config.json ADDED
@@ -0,0 +1,60 @@
+ {
+   "_name_or_path": "Michau/t5-base-en-generate-headline",
+   "architectures": [
+     "T5ForConditionalGeneration"
+   ],
+   "d_ff": 3072,
+   "d_kv": 64,
+   "d_model": 768,
+   "decoder_start_token_id": 0,
+   "dense_act_fn": "relu",
+   "dropout_rate": 0.1,
+   "eos_token_id": 1,
+   "feed_forward_proj": "relu",
+   "initializer_factor": 1.0,
+   "is_encoder_decoder": true,
+   "is_gated_act": false,
+   "layer_norm_epsilon": 1e-06,
+   "model_type": "t5",
+   "n_positions": 512,
+   "num_decoder_layers": 12,
+   "num_heads": 12,
+   "num_layers": 12,
+   "output_past": true,
+   "pad_token_id": 0,
+   "relative_attention_max_distance": 128,
+   "relative_attention_num_buckets": 32,
+   "task_specific_params": {
+     "summarization": {
+       "early_stopping": true,
+       "length_penalty": 2.0,
+       "max_length": 200,
+       "min_length": 30,
+       "no_repeat_ngram_size": 3,
+       "num_beams": 4,
+       "prefix": "summarize: "
+     },
+     "translation_en_to_de": {
+       "early_stopping": true,
+       "max_length": 300,
+       "num_beams": 4,
+       "prefix": "translate English to German: "
+     },
+     "translation_en_to_fr": {
+       "early_stopping": true,
+       "max_length": 300,
+       "num_beams": 4,
+       "prefix": "translate English to French: "
+     },
+     "translation_en_to_ro": {
+       "early_stopping": true,
+       "max_length": 300,
+       "num_beams": 4,
+       "prefix": "translate English to Romanian: "
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.26.1",
+   "use_cache": true,
+   "vocab_size": 32128
+ }
t5_model/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "decoder_start_token_id": 0,
+   "eos_token_id": 1,
+   "pad_token_id": 0,
+   "transformers_version": "4.26.1"
+ }
t5_model/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e3f73b04bb3e12b9bd1f02b88f98648da9c317f734a61e9805ae385c1c57671d
+ size 891702929
t5_model/special_tokens_map.json ADDED
@@ -0,0 +1,107 @@
+ {
+   "additional_special_tokens": [
+     "<extra_id_0>",
+     "<extra_id_1>",
+     "<extra_id_2>",
+     "<extra_id_3>",
+     "<extra_id_4>",
+     "<extra_id_5>",
+     "<extra_id_6>",
+     "<extra_id_7>",
+     "<extra_id_8>",
+     "<extra_id_9>",
+     "<extra_id_10>",
+     "<extra_id_11>",
+     "<extra_id_12>",
+     "<extra_id_13>",
+     "<extra_id_14>",
+     "<extra_id_15>",
+     "<extra_id_16>",
+     "<extra_id_17>",
+     "<extra_id_18>",
+     "<extra_id_19>",
+     "<extra_id_20>",
+     "<extra_id_21>",
+     "<extra_id_22>",
+     "<extra_id_23>",
+     "<extra_id_24>",
+     "<extra_id_25>",
+     "<extra_id_26>",
+     "<extra_id_27>",
+     "<extra_id_28>",
+     "<extra_id_29>",
+     "<extra_id_30>",
+     "<extra_id_31>",
+     "<extra_id_32>",
+     "<extra_id_33>",
+     "<extra_id_34>",
+     "<extra_id_35>",
+     "<extra_id_36>",
+     "<extra_id_37>",
+     "<extra_id_38>",
+     "<extra_id_39>",
+     "<extra_id_40>",
+     "<extra_id_41>",
+     "<extra_id_42>",
+     "<extra_id_43>",
+     "<extra_id_44>",
+     "<extra_id_45>",
+     "<extra_id_46>",
+     "<extra_id_47>",
+     "<extra_id_48>",
+     "<extra_id_49>",
+     "<extra_id_50>",
+     "<extra_id_51>",
+     "<extra_id_52>",
+     "<extra_id_53>",
+     "<extra_id_54>",
+     "<extra_id_55>",
+     "<extra_id_56>",
+     "<extra_id_57>",
+     "<extra_id_58>",
+     "<extra_id_59>",
+     "<extra_id_60>",
+     "<extra_id_61>",
+     "<extra_id_62>",
+     "<extra_id_63>",
+     "<extra_id_64>",
+     "<extra_id_65>",
+     "<extra_id_66>",
+     "<extra_id_67>",
+     "<extra_id_68>",
+     "<extra_id_69>",
+     "<extra_id_70>",
+     "<extra_id_71>",
+     "<extra_id_72>",
+     "<extra_id_73>",
+     "<extra_id_74>",
+     "<extra_id_75>",
+     "<extra_id_76>",
+     "<extra_id_77>",
+     "<extra_id_78>",
+     "<extra_id_79>",
+     "<extra_id_80>",
+     "<extra_id_81>",
+     "<extra_id_82>",
+     "<extra_id_83>",
+     "<extra_id_84>",
+     "<extra_id_85>",
+     "<extra_id_86>",
+     "<extra_id_87>",
+     "<extra_id_88>",
+     "<extra_id_89>",
+     "<extra_id_90>",
+     "<extra_id_91>",
+     "<extra_id_92>",
+     "<extra_id_93>",
+     "<extra_id_94>",
+     "<extra_id_95>",
+     "<extra_id_96>",
+     "<extra_id_97>",
+     "<extra_id_98>",
+     "<extra_id_99>"
+   ],
+   "eos_token": "</s>",
+   "pad_token": "<pad>",
+   "unk_token": "<unk>"
+ }
t5_model/spiece.model ADDED
Binary file (792 kB)
t5_model/tokenizer_config.json ADDED
@@ -0,0 +1,114 @@
+ {
+   "additional_special_tokens": [
+     "<extra_id_0>",
+     "<extra_id_1>",
+     "<extra_id_2>",
+     "<extra_id_3>",
+     "<extra_id_4>",
+     "<extra_id_5>",
+     "<extra_id_6>",
+     "<extra_id_7>",
+     "<extra_id_8>",
+     "<extra_id_9>",
+     "<extra_id_10>",
+     "<extra_id_11>",
+     "<extra_id_12>",
+     "<extra_id_13>",
+     "<extra_id_14>",
+     "<extra_id_15>",
+     "<extra_id_16>",
+     "<extra_id_17>",
+     "<extra_id_18>",
+     "<extra_id_19>",
+     "<extra_id_20>",
+     "<extra_id_21>",
+     "<extra_id_22>",
+     "<extra_id_23>",
+     "<extra_id_24>",
+     "<extra_id_25>",
+     "<extra_id_26>",
+     "<extra_id_27>",
+     "<extra_id_28>",
+     "<extra_id_29>",
+     "<extra_id_30>",
+     "<extra_id_31>",
+     "<extra_id_32>",
+     "<extra_id_33>",
+     "<extra_id_34>",
+     "<extra_id_35>",
+     "<extra_id_36>",
+     "<extra_id_37>",
+     "<extra_id_38>",
+     "<extra_id_39>",
+     "<extra_id_40>",
+     "<extra_id_41>",
+     "<extra_id_42>",
+     "<extra_id_43>",
+     "<extra_id_44>",
+     "<extra_id_45>",
+     "<extra_id_46>",
+     "<extra_id_47>",
+     "<extra_id_48>",
+     "<extra_id_49>",
+     "<extra_id_50>",
+     "<extra_id_51>",
+     "<extra_id_52>",
+     "<extra_id_53>",
+     "<extra_id_54>",
+     "<extra_id_55>",
+     "<extra_id_56>",
+     "<extra_id_57>",
+     "<extra_id_58>",
+     "<extra_id_59>",
+     "<extra_id_60>",
+     "<extra_id_61>",
+     "<extra_id_62>",
+     "<extra_id_63>",
+     "<extra_id_64>",
+     "<extra_id_65>",
+     "<extra_id_66>",
+     "<extra_id_67>",
+     "<extra_id_68>",
+     "<extra_id_69>",
+     "<extra_id_70>",
+     "<extra_id_71>",
+     "<extra_id_72>",
+     "<extra_id_73>",
+     "<extra_id_74>",
+     "<extra_id_75>",
+     "<extra_id_76>",
+     "<extra_id_77>",
+     "<extra_id_78>",
+     "<extra_id_79>",
+     "<extra_id_80>",
+     "<extra_id_81>",
+     "<extra_id_82>",
+     "<extra_id_83>",
+     "<extra_id_84>",
+     "<extra_id_85>",
+     "<extra_id_86>",
+     "<extra_id_87>",
+     "<extra_id_88>",
+     "<extra_id_89>",
+     "<extra_id_90>",
+     "<extra_id_91>",
+     "<extra_id_92>",
+     "<extra_id_93>",
+     "<extra_id_94>",
+     "<extra_id_95>",
+     "<extra_id_96>",
+     "<extra_id_97>",
+     "<extra_id_98>",
+     "<extra_id_99>"
+   ],
+   "eos_token": "</s>",
+   "extra_ids": 100,
+   "model_max_length": 512,
+   "name_or_path": "Michau/t5-base-en-generate-headline",
+   "pad_token": "<pad>",
+   "sp_model_kwargs": {},
+   "special_tokens_map_file": "/root/.cache/huggingface/hub/models--Michau--t5-base-en-generate-headline/snapshots/f526532f788c45b6b6288286e5ef929fa768ef6a/special_tokens_map.json",
+   "tokenizer_class": "T5Tokenizer",
+   "truncate": true,
+   "unk_token": "<unk>"
+ }
title_generator.py ADDED
@@ -0,0 +1,148 @@
+ import pandas as pd
+ import re
+ from nltk.corpus import stopwords
+ from gensim.parsing import preprocess_string, strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
+ import spacy
+ import torch
+ from transformers import T5ForConditionalGeneration,T5Tokenizer
+
+ #Custom text tokenizer from https://github.com/canunj/deconstructing_games by N Canu & K Chen
+ def doc_text_preprocessing(ser):
+     nlp=spacy.load("en_core_web_sm", exclude=['parser','ner','textcat'])
+
+     """text processing steps"""
+     stop_words=set(stopwords.words('english'))
+
+     single_letter_replace=lambda c: re.sub("\s+\w{1}\s+|\n|-|—",'',c)
+     to_lower_func=lambda c: c.lower()
+     lemma_text=[preprocess_string(
+         ' '.join([token.lemma_ for token in desc]
+         ),[remove_stopwords,strip_numeric,strip_punctuation,strip_tags,
+            strip_multiple_whitespaces,single_letter_replace,to_lower_func]
+         ) for desc in ser.apply(lambda x: nlp(x))]
+
+     tokenize_text=[[word for word in string if word not in stop_words] for string in lemma_text]
+
+     return tokenize_text
+
+ class Title_Generator:
+
+     def __init__(self, path, df):
+         self.model = T5ForConditionalGeneration.from_pretrained(path)
+         self.tokenizer = T5Tokenizer.from_pretrained(path)
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.model.to(self.device)
+         self.game_df = df
+
+         self.title_iter = -1
+         self.out_titles = None
+         self.best_title = None
+         self.description = None
+
+
+     def candidate_generator(self, description):
+         text = "headline: " + description
+
+         encoding = self.tokenizer.encode_plus(text, return_tensors = "pt")
+         input_ids = encoding["input_ids"].to(self.device)
+         attention_masks = encoding["attention_mask"].to(self.device)
+
+         candidates = []
+
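+         #diverse beam search: 16 beams in 4 groups with a small diversity penalty,
+         #returning 8 distinct candidate headlines per description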
+         beam_outputs = self.model.generate(
+             input_ids = input_ids,
+             attention_mask = attention_masks,
+             max_length = 64,
+             num_beams = 16,
+             num_beam_groups=4,
+             num_return_sequences=8,
+             diversity_penalty=.1,
+             repetition_penalty=.9,
+             early_stopping = True)
+
+         for result in beam_outputs:
+             res = self.tokenizer.decode(result).replace('<pad> ','').replace('</s>','').replace('<pad>','')
+             candidates.append(res)
+
+         return candidates, description
+
+     def candidate_score(self,candidates,ex_check=None):
+         import random
+         from operator import itemgetter
+
+         if ex_check is not None:
+             pat = re.compile("((?:" + "|".join(map(re.escape, candidates[0]+[cand.upper() for cand in candidates[0]])) + "|" + "|".join(ex_check) +"))")
+             desc = re.sub(pat, "__", candidates[1])
+         else:
+             pat = re.compile("((?:" + "|".join(map(re.escape, candidates[0]+[cand.upper() for cand in candidates[0]])) + "))")
+             desc = re.sub(pat, "__", candidates[1])
+
+
+         if re.search(re.compile(re.escape("__")), desc):
+             reg = re.compile("("+"|".join(ex_check) + ")")
+             hold = candidates[0]
+             gen_desc = re.sub(re.compile(re.escape("__")),"",desc)
+             candidates = self.candidate_generator(gen_desc)
+             next = [cand for cand in candidates[0]+hold if not reg.search(cand)]
+             candidates = (next, desc)
+
+         #backup load function, will refactor
+         nlp=spacy.load("en_core_web_md")
+
+         #check for existing games and duplicates
+         #transform function from https://stackoverflow.com/questions/42165779/python-how-to-remove-duplicate-valuescase-insensitive-from-a-list-with-same-o
+         def transform(L):
+             S = set()
+             return [item.title() for item in L if item.lower() not in S and not S.add(item.lower())]
+
+
+         clean_cand_step = list(set([game[0] for game in list(zip(candidates[0],[len(self.game_df[self.game_df.name.isin([x])]) for x in candidates[0]])) if game[1]==0]))
+         clean_cand_step = transform(clean_cand_step)
+
+         clean_cand_step = [re.sub(re.compile("(?<=[ ])And(?=[ ])"),'and',
+                            re.sub(re.compile('(?<=\S) (([(]|\b)[Ss]econd [Ee]dition([)]|\b)|[Ss]econd [Ee]dition|2[Nn][Dd] [Ee]dition|([(]|\b)[Tt]hird [Ee]dition([)]|\b)|3[Rr][Dd] [Ee]dition)|["]Second Edition["]'),"",
+                            re.sub(re.compile("(?<=[a-z])'S"),"'s",
+                            re.sub(re.compile("(?<=[ ])Of(?=[ ])"),"of",x))))
+                            for x in clean_cand_step]
+
+
+         clean_cand = []
+         for cand in clean_cand_step:
+             try:
+                 inter = cand.split(":")
+                 if inter[0].lower()==inter[1].lower():
+                     clean_cand.append(inter[0])
+                 else:
+                     clean_cand.append(cand)
+             except IndexError:
+                 clean_cand.append(cand)
+
+         #text processing
+         token_cand = doc_text_preprocessing(pd.Series(clean_cand))
+         token_art = doc_text_preprocessing(pd.Series([candidates[1]]))
+         sim = [nlp(title) for title in [" ".join(title) for title in token_cand]]
+         doc = nlp(" ".join(token_art[0]))
+
+         #scores cosine similarity between generated titles and body text; if a word is unknown (i.e. the generator knows it but spacy doesn't),
+         #a random probability is assigned to populate the score
+
+         scores = [x if x !=0 else random.uniform(.3, .7) for x in [tok.similarity(doc) for tok in sim]]
+
+         out_titles = sorted(list(zip(clean_cand,scores)),key=itemgetter(1),reverse=True)
+
+         pat = re.compile("(?<=[!.?])(?=[^\s])")
+         pat2 = re.compile("([Ff]rom the [Pp]ublisher[: ]|[Ff]rom the [Dd]esigner[: ]|[Gg]ame [Dd]escription)")
+         pat3 = re.compile(": [Tt]he [Gg]ame: [Tt]he [Gg]ame|: [Tt]he [Gg]ame")
+         pat4 = re.compile("[Tt]he __")
+         pat5 = re.compile("__ [Gg]ame")
+         pat6 = re.compile("[Tt]he [Gg]ame [Oo]f __")
+
+         desc = re.sub(pat," ",candidates[1])
+         desc = re.sub(pat2,"",desc)
+         desc = re.sub(pat3,"",desc)
+         desc = re.sub(pat4,"__",desc)
+         desc = re.sub(pat5,"__",desc)
+         desc = re.sub(pat6,"__",desc)
+
+         return {'text':desc,'titles':out_titles}