import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import (preprocess_string, strip_tags, strip_numeric,
                                          strip_multiple_whitespaces, strip_punctuation,
                                          remove_stopwords)
import spacy
from langdetect import detect
import pickle
import gzip
nltk.download('stopwords')  #fetch the English stopword list on first run

#function definitions

#strips the value text out of encoded stream lists, dropping the trailing ':<id>' suffix
def text_col_cleaner(frame, cols, pattern):
    frame = frame.copy()  #work on a copy so the caller's slice isn't mutated
    pattern = re.compile(pattern)  #no-op if a compiled pattern is passed in
    for col in cols:
        frame[col] = frame[col].map(
            lambda x: [pattern.findall(val)[0].strip() for val in x],
            na_action='ignore')
    return frame
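
#a minimal sketch of the expected data, using hypothetical values: a raw cell such as
#  ['Strategy Games:5497', 'Thematic Games:5496']
#comes back as ['Strategy Games', 'Thematic Games'], since the greedy pattern keeps
#everything before the last colon. Entries without a colon would raise an IndexError
#on the [0] lookup, so every value is assumed to carry the ':<id>' suffix.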

#one-hot encodes every list-valued column in the frame and appends the indicator columns
def encode_columns(frame):
    targets = list(frame.columns)
    for t in targets:
        one_hot = (pd.get_dummies(frame[t].apply(pd.Series).stack(), prefix=t)
                   .groupby(level=0).sum())
        frame = pd.concat([frame, one_hot], axis=1)
    return frame
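
#sketch of the expansion with hypothetical rows: a 'mechanic' column holding
#['Dice Rolling', 'Set Collection'] and ['Dice Rolling'] yields indicator columns
#'mechanic_Dice Rolling' = [1, 1] and 'mechanic_Set Collection' = [1, 0];
#.apply(pd.Series).stack() unrolls the lists to one value per row, and
#.groupby(level=0).sum() folds the dummies back to one row per game.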

#custom text processor for tokenizing descriptions by Kuan Chen & Nick Canu
def doc_text_preprocessing(ser):
  """Lemmatize, filter, and tokenize a Series of description strings."""
  nlp=spacy.load("en_core_web_sm", exclude=['parser','ner','textcat'])

  stop_words=set(stopwords.words('english'))
  stop_words.update(['game','player','players','games', 'also',
                     'description','publisher'])

  single_letter_replace=lambda c: re.sub(r"\s+\w\s+|\n|-|—",'',c)
  to_lower_func=lambda c: c.lower()

  lemma_text=[preprocess_string(
      ' '.join(token.lemma_ for token in desc),
      [remove_stopwords,strip_numeric,strip_punctuation,strip_tags,
       strip_multiple_whitespaces,single_letter_replace,to_lower_func]
      ) for desc in ser.apply(nlp)]

  tokenize_text=[[word for word in string if word not in stop_words] for string in lemma_text]

  return tokenize_text
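
#rough walkthrough with a hypothetical description: "Players roll dice to build 3
#castles." lemmatizes to "player roll dice to build 3 castle"; the gensim filter
#chain then drops stopwords, digits, and punctuation, and the custom stopword pass
#removes 'player', leaving roughly ['roll', 'dice', 'build', 'castle'].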

#performs English language detection on the descriptions w/langdetect, then additionally drops games using non-English characters in the name
def lang_cleanup(frame):
  frame = frame.copy()  #work on a copy of the slice passed in (avoids SettingWithCopyWarning)
  frame['description']=frame['description'].fillna('no words')
  frame = frame[frame['description']!='no words']
  frame['cleaned_descriptions']=doc_text_preprocessing(frame['description'])

  detected_lang = []
  for words in frame.cleaned_descriptions:
    detected_lang.append(detect(', '.join(words)))
  frame['lang'] = detected_lang
  frame = frame[frame['lang']=='en']

  non_eng_title_filter = frame['name'].str.contains(r'[^\x00-\x7f]', na=False)
  return frame[~non_eng_title_filter]
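
#note: langdetect is non-deterministic by default; for reproducible runs the seed
#can be pinned before calling lang_cleanup, e.g.:
#  from langdetect import DetectorFactory
#  DetectorFactory.seed = 0
#detect() also raises LangDetectException on text with no detectable features, so
#descriptions that clean down to an empty token list would need a guard.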


#column name stripper for creating key values
def column_fixer(frame,targ):
  return [col.replace(targ, "").strip('"') for col in frame.columns if col.startswith(targ)]
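
#example with a hypothetical one-hot column: for a frame containing
#'mechanic_Dice Rolling', column_fixer(frame, 'mechanic_') returns
#['Dice Rolling'], the human-readable key for that feature.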

#creates two key collections: plain string lists of feature values for defining the web app's selection lists, and spaCy tokens of the same values for unknown-input search
def key_collator(frame):
  nlp=spacy.load("en_core_web_sm")
  fam = column_fixer(frame,'family_')
  gt = column_fixer(frame,'game_type_')
  mec = column_fixer(frame,'mechanic_')
  cat = column_fixer(frame,'category_')

  current_keys = (['cooperative'],gt,mec,cat,fam)

  fam_keys = [nlp(w) for w in fam]
  gt_keys = [nlp(w) for w in gt]
  mec_keys = [nlp(w) for w in mec]
  cat_keys = [nlp(w) for w in cat]

  search_tokens = (gt_keys,mec_keys,cat_keys,fam_keys)

  return current_keys, search_tokens
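
#for reference: current_keys is ordered (cooperative flag, game types, mechanics,
#categories, families) as plain strings for the selection lists, while
#search_tokens holds the game type/mechanic/category/family values as spaCy Docs,
#presumably for similarity matching against free-text input downstream.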
   

#-----------

#reading in raw file & removing unranked and compilation game items
df = pd.read_json(r'./bgg_GameItem.jl', lines=True)
df['rank'] = df['rank'].fillna(0).astype(int)
df = df[(df['rank']>0) & (df['compilation']!=1)]

#separating and cleaning the one-hot target columns
in_df = text_col_cleaner(frame = df[['game_type','mechanic','category','family']],
                    cols = ['game_type','mechanic','category','family'],
                    pattern = r"([\S ]+)(?=:)")

print('Text has been cleaned, now encoding one-hot columns')

#encoding one-hot columns and rejoining to features for output
proc_df = encode_columns(in_df)
step = df[['name','description','cooperative']]
join_df = pd.concat([step,proc_df.drop(['game_type','mechanic','category','family',
    'game_type_Amiga','game_type_Arcade','game_type_Atari ST',
    'game_type_Commodore 64'],axis=1)],axis=1)  #drop raw list columns and video-game platform types

print('Columns encoded, now performing English language detection and cleanup')

#English language detection steps & first data save
eng_df = lang_cleanup(join_df)
eng_df = eng_df.loc[:,~eng_df.columns.duplicated()].copy().reset_index(drop=True).fillna(0)

print('Creating vector-only dataframe & saving output')

#vector only data for operations
vector_df = eng_df.copy().drop(['name','description','cleaned_descriptions','lang'],axis=1)

eng_df.to_parquet('game_data.parquet.gzip',compression='gzip')
vector_df.to_parquet('game_vectors.parquet.gzip',compression='gzip')
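
#a minimal sketch of reading the saved frames back (pandas infers the gzip
#compression from the file itself, so no extra arguments are needed):
#  eng_df = pd.read_parquet('game_data.parquet.gzip')
#  vector_df = pd.read_parquet('game_vectors.parquet.gzip')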

print('Creating key lists')

#creating key lists - 1. string lists of values by feature class for defining input selections & 2. spaCy-processed lists for unknown-input search
keys, search_toks = key_collator(vector_df)

with gzip.open("current_keys.gz", "wb") as f:
    pickle.dump(keys, f)

with gzip.open("key_search_tokens.gz", "wb") as f:
    pickle.dump(search_toks, f)
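
#to load these artifacts later (e.g., from the web app), mirror the dump calls:
#  with gzip.open("current_keys.gz", "rb") as f:
#      current_keys = pickle.load(f)
#  with gzip.open("key_search_tokens.gz", "rb") as f:
#      search_tokens = pickle.load(f)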

print('File creation is complete')