import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from gensim.parsing import (preprocess_string, strip_tags, strip_numeric,
                            strip_multiple_whitespaces, stem_text,
                            strip_punctuation, remove_stopwords)
import spacy
from langdetect import detect
import pickle
import gzip

nltk.download('stopwords')
#function definitions
#strips the value portion out of each encoded entry in list-valued columns via the supplied regex
def text_col_cleaner(frame, cols, pattern):
    pattern = re.compile(pattern)  #re.compile passes a pre-compiled pattern through unchanged
    for col in cols:
        #na_action='ignore' skips NaN cells instead of raising
        frame[col] = frame[col].map(
            lambda x: [re.findall(pattern, val)[0].strip() for val in x],
            na_action='ignore')
    return frame
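#illustrative sketch (hypothetical cell value, assuming the scraped columns
#hold "label:id" strings): with pattern r"([\S ]+)(?=:)",
#['Dice Rolling:2072', 'Hand Management:2040'] -> ['Dice Rolling', 'Hand Management']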
#converts specified list-valued columns to one-hot indicator columns
def encode_columns(frame):
    targets = list(frame.columns)
    for t in targets:
        #explode each list cell to one row per element, one-hot encode,
        #then sum back to the original row index
        one_hot = pd.get_dummies(frame[t].apply(pd.Series).stack(),
                                 prefix=t).groupby(level=0).sum()
        frame = pd.concat([frame, one_hot], axis=1)
    return frame
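#minimal sketch of the encoding trick on toy data:
#  s = pd.Series([['a', 'b'], ['b']])
#  pd.get_dummies(s.apply(pd.Series).stack(), prefix='t').groupby(level=0).sum()
#  ->    t_a  t_b
#     0    1    1
#     1    0    1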
#custom text processor for tokenizing descriptions by Kuan Chen & Nick Canu
def doc_text_preprocessing(ser):
    """Lemmatizes with spaCy, then cleans with gensim filters and custom stop words."""
    nlp = spacy.load("en_core_web_sm", exclude=['parser', 'ner', 'textcat'])
    stop_words = set(stopwords.words('english'))
    stop_words.update(['game', 'player', 'players', 'games', 'also',
                       'description', 'publisher'])
    #raw string avoids invalid-escape warnings in the regex
    single_letter_replace = lambda c: re.sub(r"\s+\w{1}\s+|\n|-|—", '', c)
    to_lower_func = lambda c: c.lower()
    lemma_text = [preprocess_string(
        ' '.join(token.lemma_ for token in desc),
        [remove_stopwords, strip_numeric, strip_punctuation, strip_tags,
         strip_multiple_whitespaces, single_letter_replace, to_lower_func]
    ) for desc in ser.apply(nlp)]
    tokenize_text = [[word for word in string if word not in stop_words]
                     for string in lemma_text]
    return tokenize_text
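#sketch of the pipeline on one description (tokens are illustrative; exact
#output depends on the spaCy model version):
#  "Players roll dice and trade resources."
#  -> spaCy lemmas -> "player roll dice and trade resource ."
#  -> gensim filters -> ['player', 'roll', 'dice', 'trade', 'resource']
#  -> custom stop word pass -> ['roll', 'dice', 'trade', 'resource']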
#performs English language detection on the descriptions w/langdetect, then
#additionally drops games using non-English characters in the name
def lang_cleanup(frame):
    #flag missing descriptions and drop those rows before language detection
    frame['description'] = frame['description'].fillna('no words')
    frame = frame[frame['description'] != 'no words'].copy()
    frame['cleaned_descriptions'] = doc_text_preprocessing(frame['description'])
    detected_lang = []
    for tokens in frame.cleaned_descriptions:
        #langdetect expects a string, so rejoin the token list
        detected_lang.append(detect(', '.join(tokens)))
    frame['lang'] = detected_lang
    frame = frame[frame['lang'] == 'en']
    #any non-ASCII character in the name flags the title as non-English
    non_eng_title_filter = frame['name'].str.contains(r'[^\x00-\x7f]')
    return frame[~non_eng_title_filter]
#column name stripper for creating key values
def column_fixer(frame, targ):
    return [col.replace(targ, "").strip('"') for col in frame.columns
            if col.startswith(targ)]
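#e.g. column_fixer(frame, 'mechanic_') turns a column named
#'mechanic_Dice Rolling' into the key 'Dice Rolling'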
#creates two key collections: 1. string keys for defining web app selection
#lists & 2. nlp-processed tokens of the same keys for unknown-input search
def key_collator(frame):
    nlp = spacy.load("en_core_web_sm")
    fam = column_fixer(frame, 'family_')
    gt = column_fixer(frame, 'game_type_')
    mec = column_fixer(frame, 'mechanic_')
    cat = column_fixer(frame, 'category_')
    #plain string keys for populating the web app's input selections
    current_keys = (['cooperative'], gt, mec, cat, fam)
    #spaCy Docs of the same keys, used for similarity search on unknown input
    fam_keys = [nlp(w) for w in fam]
    gt_keys = [nlp(w) for w in gt]
    mec_keys = [nlp(w) for w in mec]
    cat_keys = [nlp(w) for w in cat]
    search_tokens = (gt_keys, mec_keys, cat_keys, fam_keys)
    return current_keys, search_tokens
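#sketch of the returned shapes (values depend on the encoded columns; note
#the 'cooperative' flag appears only in current_keys, not in search_tokens):
#  current_keys  = (['cooperative'], [game types], [mechanics], [categories], [families])
#  search_tokens = ([game type Docs], [mechanic Docs], [category Docs], [family Docs])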
#-----------
#reading in raw file & removing unranked and compilation game items
df = pd.read_json(r'./bgg_GameItem.jl', lines=True)
df['rank'] = df['rank'].fillna(0).astype(int)
df = df[(df['rank'] > 0) & (df['compilation'] != 1)]

#separating and cleaning the one-hot target columns
in_df = text_col_cleaner(frame=df[['game_type', 'mechanic', 'category', 'family']].copy(),
                         cols=['game_type', 'mechanic', 'category', 'family'],
                         pattern=re.compile(r"([\S ]+)(?=:)"))
print('Text has been cleaned, now encoding one-hot columns')

#encoding one-hot columns and rejoining to features for output; drops the raw
#list columns plus the game_type labels that refer to video-game platforms
proc_df = encode_columns(in_df)
step = df[['name', 'description', 'cooperative']]
join_df = pd.concat([step, proc_df.drop(['game_type', 'mechanic', 'category', 'family',
                                         'game_type_Amiga', 'game_type_Arcade',
                                         'game_type_Atari ST', 'game_type_Commodore 64'],
                                        axis=1)], axis=1)
print('Columns encoded, now performing English language detection and cleanup')

#English language detection steps & first data cleanup
eng_df = lang_cleanup(join_df)
#drop duplicate columns, reset the index, and zero-fill remaining NaNs
eng_df = eng_df.loc[:, ~eng_df.columns.duplicated()].copy().reset_index(drop=True).fillna(0)
print('Creating vector-only dataframe & saving output')

#vector-only data for operations
vector_df = eng_df.copy().drop(['name', 'description', 'cleaned_descriptions', 'lang'], axis=1)
eng_df.to_parquet('game_data.parquet.gzip', compression='gzip')
vector_df.to_parquet('game_vectors.parquet.gzip', compression='gzip')
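#downstream consumers can reload either frame with, e.g.:
#  eng_df = pd.read_parquet('game_data.parquet.gzip')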
print('Creating key lists')

#creating key lists - 1. string list of values by feature class for defining
#input selections & 2. nlp-processed list for unknown input search
keys, search_toks = key_collator(vector_df)
#the with blocks close the files automatically, so no explicit close is needed
with gzip.open("current_keys.gz", "wb") as f:
    pickle.dump(keys, f)
with gzip.open("key_search_tokens.gz", "wb") as f:
    pickle.dump(search_toks, f)
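#the web app side can restore both objects with the mirror-image calls, e.g.:
#  with gzip.open("current_keys.gz", "rb") as f:
#      keys = pickle.load(f)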
print('File creation is complete')