import re
import gzip
import pickle

import pandas as pd
import nltk
from nltk.corpus import stopwords
from gensim.parsing import (preprocess_string, strip_tags, strip_numeric,
                            strip_multiple_whitespaces, strip_punctuation,
                            remove_stopwords)
import spacy
from langdetect import detect, LangDetectException

nltk.download('stopwords')

#function definitions

#strips the display value out of each "label:id" entry in the list-valued columns
def text_col_cleaner(frame, cols, pattern):
    pattern = re.compile(pattern)
    for col in cols:
        frame[col] = frame[col].map(
            lambda x: [re.findall(pattern, val)[0].strip() for val in x],
            na_action='ignore')
    return frame

#converts list-valued columns to one-hot indicator columns
def encode_columns(frame):
    targets = list(frame.columns)
    for t in targets:
        #explode each list into rows, one-hot encode, then collapse back to one row per game
        one_hot = pd.get_dummies(frame[t].apply(pd.Series).stack(), prefix=t).groupby(level=0).sum()
        frame = pd.concat([frame, one_hot], axis=1)
    return frame

#custom text processor for tokenizing descriptions, by Kuan Chen & Nick Canu
def doc_text_preprocessing(ser):
    """Lemmatize, clean, and tokenize a Series of description strings."""
    nlp = spacy.load("en_core_web_sm", exclude=['parser', 'ner', 'textcat'])
    stop_words = set(stopwords.words('english'))
    stop_words.update(['game', 'player', 'players', 'games', 'also',
                       'description', 'publisher'])

    single_letter_replace = lambda c: re.sub(r"\s+\w{1}\s+|\n|-|—", '', c)
    to_lower_func = lambda c: c.lower()

    #lemmatize each description with spaCy, then run the gensim filter chain
    lemma_text = [preprocess_string(
        ' '.join([token.lemma_ for token in desc]),
        [remove_stopwords, strip_numeric, strip_punctuation, strip_tags,
         strip_multiple_whitespaces, single_letter_replace, to_lower_func])
        for desc in ser.apply(lambda x: nlp(x))]

    #drop the domain-specific stopwords from the lemmatized tokens
    tokenize_text = [[word for word in string if word not in stop_words]
                     for string in lemma_text]
    return tokenize_text

#performs english language detection on the descriptions with langdetect, then
#additionally drops games whose names use non-english characters
def lang_cleanup(frame):
    frame['description'] = frame['description'].fillna('no words')
    frame = frame[frame['description'] != 'no words'].copy()
    frame['cleaned_descriptions'] = doc_text_preprocessing(frame['description'])

    detected_lang = []
    for tokens in frame.cleaned_descriptions:
        text = ', '.join(tokens)
        try:
            detected_lang.append(detect(text))
        except LangDetectException:
            #empty or otherwise undetectable descriptions fall through as unknown
            detected_lang.append('unknown')
    frame['lang'] = detected_lang
    frame = frame[frame['lang'] == 'en']

    non_eng_title_filter = frame['name'].str.contains('[^\x00-\x7f]', flags=re.IGNORECASE)
    return frame[~non_eng_title_filter]

#strips the feature-class prefix off column names for creating key values
def column_fixer(frame, targ):
    return [col.replace(targ, "").strip('"') for col in frame.columns if col.startswith(targ)]

#creates 1. key lists for defining the web app selection lists and 2. nlp tokens
#of the same keys for unknown-input search
def key_collator(frame):
    nlp = spacy.load("en_core_web_sm")
    fam = column_fixer(frame, 'family_')
    gt = column_fixer(frame, 'game_type_')
    mec = column_fixer(frame, 'mechanic_')
    cat = column_fixer(frame, 'category_')
    current_keys = (['cooperative'], gt, mec, cat, fam)

    fam_keys = [nlp(w) for w in fam]
    gt_keys = [nlp(w) for w in gt]
    mec_keys = [nlp(w) for w in mec]
    cat_keys = [nlp(w) for w in cat]
    search_tokens = (gt_keys, mec_keys, cat_keys, fam_keys)
    return current_keys, search_tokens

#-----------

#reading in the raw file & removing unranked and compilation game items
df = pd.read_json(r'./bgg_GameItem.jl', lines=True)
df['rank'] = df['rank'].fillna(0).astype(int)
df = df[(df['rank'] > 0) & (df['compilation'] != 1)]

#separating and cleaning the one-hot target columns
in_df = text_col_cleaner(frame=df[['game_type', 'mechanic', 'category', 'family']].copy(),
                         cols=['game_type', 'mechanic', 'category', 'family'],
                         pattern=re.compile(r"([\S ]+)(?=:)"))
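#at this point each target column holds lists of plain labels: e.g., assuming the
#scraped values follow the "label:id" convention the lookahead regex expects, a raw
#cell of ['Card Game:2', 'Dice:5'] reduces to ['Card Game', 'Dice']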
print('Text has been cleaned, now encoding one-hot columns')

#encoding one-hot columns and rejoining to the feature columns for output,
#dropping the source list columns and the computer-platform game_type columns
proc_df = encode_columns(in_df)
step = df[['name', 'description', 'cooperative']]
join_df = pd.concat([step,
                     proc_df.drop(['game_type', 'mechanic', 'category', 'family',
                                   'game_type_Amiga', 'game_type_Arcade',
                                   'game_type_Atari ST', 'game_type_Commodore 64'],
                                  axis=1)],
                    axis=1)

print('Columns encoded, now performing english language detection and cleanup')

#english language detection & cleanup, then de-duplicating columns before the first save
eng_df = lang_cleanup(join_df)
eng_df = eng_df.loc[:, ~eng_df.columns.duplicated()].copy().reset_index(drop=True).fillna(0)

print('Creating vector-only dataframe & saving output')

#vector-only data for operations
vector_df = eng_df.copy().drop(['name', 'description', 'cleaned_descriptions', 'lang'], axis=1)

eng_df.to_parquet('game_data.parquet.gzip', compression='gzip')
vector_df.to_parquet('game_vectors.parquet.gzip', compression='gzip')

print('Creating key lists')

#creating key lists - 1. string lists of values by feature class for defining
#input selections & 2. nlp-processed lists for unknown-input search
keys, search_toks = key_collator(vector_df)

with gzip.open("current_keys.gz", "wb") as f:
    pickle.dump(keys, f)

with gzip.open("key_search_tokens.gz", "wb") as f:
    pickle.dump(search_toks, f)

print('File creation is complete')
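#-----------
#downstream usage sketch: reloading the saved artifacts (file names are the ones
#written above; the variable names are illustrative)
#
# games = pd.read_parquet('game_data.parquet.gzip')
# vectors = pd.read_parquet('game_vectors.parquet.gzip')
# with gzip.open('current_keys.gz', 'rb') as f:
#     keys = pickle.load(f)
# with gzip.open('key_search_tokens.gz', 'rb') as f:
#     search_toks = pickle.load(f)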