import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from gensim.parsing import preprocess_string, strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
import spacy
from langdetect import detect
import pickle
import gzip
nltk.download('stopwords')
#function definitions
#strips values out of encoded stream lists
def text_col_cleaner(frame, cols, pattern):
    pattern = re.compile(pattern)
    for col in cols:
        frame[col] = frame[col].map(lambda x: [re.findall(pattern, val)[0].strip() for val in x], na_action='ignore')
    return frame
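#example (hypothetical data) - each target column holds lists of "label:id" strings,
#and the regex keeps only the text before the colon:
#   sample = pd.DataFrame({'mechanic': [['Hand Management:2040', 'Set Collection:2004']]})
#   text_col_cleaner(sample, ['mechanic'], r"([\S ]+)(?=:)")
#   -> the mechanic column becomes [['Hand Management', 'Set Collection']]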
#converts specified columns to one-hot
def encode_columns(frame):
    targets = list(frame.columns)
    for t in targets:
        one_hot = pd.get_dummies(frame[t].apply(pd.Series).stack(), prefix=t).groupby(level=0).sum()
        frame = pd.concat([frame, one_hot], axis=1)
    return frame
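#how the encoding works (illustrative): a row with mechanic ['Hand Management', 'Set Collection']
#gains columns mechanic_Hand Management = 1 and mechanic_Set Collection = 1;
#.apply(pd.Series).stack() flattens each list to one value per row so get_dummies can encode it,
#and .groupby(level=0).sum() folds the dummy rows back to one row per game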
#custom text processor for tokenizing descriptions by Kuan Chen & Nick Canu
def doc_text_preprocessing(ser):
    """text processing steps: spaCy lemmatization, gensim string filters, then custom stopword removal"""
    nlp = spacy.load("en_core_web_sm", exclude=['parser','ner','textcat'])
    stop_words = set(stopwords.words('english'))
    stop_words.update(['game','player','players','games', 'also',
                       'description','publisher'])
    single_letter_replace = lambda c: re.sub(r"\s+\w{1}\s+|\n|-|—", '', c)
    to_lower_func = lambda c: c.lower()
    lemma_text = [preprocess_string(
        ' '.join([token.lemma_ for token in desc]),
        [remove_stopwords, strip_numeric, strip_punctuation, strip_tags,
         strip_multiple_whitespaces, single_letter_replace, to_lower_func]
    ) for desc in ser.apply(lambda x: nlp(x))]
    tokenize_text = [[word for word in string if word not in stop_words] for string in lemma_text]
    return tokenize_text
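#example (hypothetical) - doc_text_preprocessing(pd.Series(["Players build engines of 3 cards."]))
#returns roughly [['build', 'engine', 'card']]: spaCy lemmatizes each description, the gensim
#filters strip numbers/punctuation/extra whitespace/common stopwords, and the custom stop_words
#set drops domain words like 'players'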
#performs English language detection on the descriptions w/langdetect, then additionally drops games using non-English characters in the name
def lang_cleanup(frame):
    frame = frame.copy()
    frame['description'] = frame['description'].fillna('no words')
    frame = frame[frame['description'] != 'no words']
    frame['cleaned_descriptions'] = doc_text_preprocessing(frame['description'])
    detected_lang = []
    for word in frame.cleaned_descriptions:
        word = ', '.join(word)
        #guard against empty token lists, which langdetect cannot classify
        detected_lang.append(detect(word) if word else 'unknown')
    frame['lang'] = detected_lang
    frame = frame[frame['lang'] == 'en']
    non_eng_title_filter = frame['name'].str.contains(r'[^\x00-\x7f]', flags=re.IGNORECASE)
    return frame[~non_eng_title_filter]
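#example (hypothetical) - a row whose description langdetect labels 'de', or whose name contains
#non-ASCII characters such as 'Würfel', is dropped; surviving rows carry a 'cleaned_descriptions'
#token list and a 'lang' column equal to 'en'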
#column name stripper for creating key values
def column_fixer(frame, targ):
    return [col.replace(targ, "").strip('"') for col in frame.columns if col.startswith(targ)]
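#example (hypothetical) - column_fixer(frame, 'mechanic_') turns a column name like
#'mechanic_Hand Management' into the key string 'Hand Management'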
#creates 1) key lists of feature values for defining the web app's selection lists & 2) spaCy-processed tokens of the same keys for unknown input search
def key_collator(frame):
    nlp = spacy.load("en_core_web_sm")
    fam = column_fixer(frame, 'family_')
    gt = column_fixer(frame, 'game_type_')
    mec = column_fixer(frame, 'mechanic_')
    cat = column_fixer(frame, 'category_')
    current_keys = (['cooperative'], gt, mec, cat, fam)
    fam_keys = [nlp(w) for w in fam]
    gt_keys = [nlp(w) for w in gt]
    mec_keys = [nlp(w) for w in mec]
    cat_keys = [nlp(w) for w in cat]
    search_tokens = (gt_keys, mec_keys, cat_keys, fam_keys)
    return current_keys, search_tokens
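#note (illustrative) - current_keys holds plain strings, e.g. (['cooperative'], ['Abstract Game', ...],
#['Hand Management', ...], ['Adventure', ...], ['Animals', ...]), while search_tokens holds the same
#strings as spaCy Doc objects so the app can (presumably) match free-text input against them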
#-----------
#reading in raw file & removing unranked and compilation game items
df = pd.read_json(r'./bgg_GameItem.jl', lines=True)
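#bgg_GameItem.jl is assumed to be a line-delimited JSON export of scraped BoardGameGeek items,
#one game per line, which is why read_json is called with lines=True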
df['rank'] = df['rank'].fillna(0).astype(int)
df = df[(df['rank']>0) & (df['compilation']!=1)]
#separating and cleaning the one-hot target columns
in_df = text_col_cleaner(frame = df[['game_type','mechanic','category','family']].copy(),
                         cols = ['game_type','mechanic','category','family'],
                         pattern = re.compile(r"([\S ]+)(?=:)"))
print('Text has been cleaned, now encoding one-hot columns')
#encoding one-hot columns and rejoining to features for output
proc_df = encode_columns(in_df)
step = df[['name','description','cooperative']]
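#the game_type_* columns dropped below are (assumed) video-game platform labels that appear in the
#BGG game_type field but aren't relevant to board games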
join_df = pd.concat([step, proc_df.drop(['game_type','mechanic','category','family',
                                         'game_type_Amiga','game_type_Arcade','game_type_Atari ST',
                                         'game_type_Commodore 64'], axis=1)], axis=1)
print('Columns encoded, now performing English language detection and cleanup')
#English language detection steps & first data save
eng_df = lang_cleanup(join_df)
eng_df = eng_df.loc[:,~eng_df.columns.duplicated()].copy().reset_index(drop=True).fillna(0)
print('Creating vector-only dataframe & saving output')
#vector only data for operations
vector_df = eng_df.copy().drop(['name','description','cleaned_descriptions','lang'],axis=1)
eng_df.to_parquet('game_data.parquet.gzip',compression='gzip')
vector_df.to_parquet('game_vectors.parquet.gzip',compression='gzip')
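#downstream consumers (e.g. the web app) are assumed to reload these with pd.read_parquet('game_data.parquet.gzip')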
print('Creating key lists')
#creating key lists - 1. string list of values by feature class for defining input selections & 2. nlp processed list for unknown input search
keys, search_toks = key_collator(vector_df)
with gzip.open("current_keys.gz", "wb") as f:
pickle.dump(keys, f)
f.close()
with gzip.open("key_search_tokens.gz", "wb") as f:
pickle.dump(search_toks, f)
f.close()
print('File creation is complete')
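#the pickled keys can be restored the same way, e.g. (sketch):
#   with gzip.open("current_keys.gz", "rb") as f:
#       current_keys = pickle.load(f)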