Spaces:

AutoBG
/

Auto-BoardGame

Runtime error

Auto-BoardGame / title_generator.py

Nick Canu

final app push

02c682c over 1 year ago

6.37 kB

	import pandas as pd
	import re
	import nltk
	nltk.download('stopwords')
	from nltk.corpus import stopwords
	from gensim.parsing import preprocess_string, strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
	import spacy
	import torch
	from transformers import T5ForConditionalGeneration,T5Tokenizer
	import random
	from operator import itemgetter

	#Custom text tokenizer from https://github.com/canunj/deconstructing_games by N Canu & K Chen
	def doc_text_preprocessing(ser):
	    """Lemmatize, clean and tokenize every text in a pandas Series.

	    Each text is run through the spaCy ``en_core_web_md`` pipeline (with the
	    parser, NER and textcat components excluded), its lemmas are joined back
	    into a string, and that string is passed through a chain of gensim
	    filters followed by a final NLTK English stop-word filter.

	    Args:
	        ser: pandas Series of strings to process.

	    Returns:
	        A list with one entry per input text; each entry is the list of
	        remaining lower-cased tokens for that text.
	    """
	    # Loading the spaCy pipeline is by far the most expensive step, so it is
	    # loaded once and cached on the function object for later calls.
	    nlp = getattr(doc_text_preprocessing, "_nlp", None)
	    if nlp is None:
	        nlp = spacy.load("en_core_web_md", exclude=['parser','ner','textcat'])
	        doc_text_preprocessing._nlp = nlp

	    stop_words = set(stopwords.words('english'))

	    def single_letter_replace(text):
	        # Drop isolated single characters plus stray newlines and dashes.
	        # The alternation must be a plain ``|`` (an escaped ``\|`` matches a
	        # literal pipe and the filter would never fire). A space is used as
	        # the replacement so the neighbouring words are not fused together;
	        # preprocess_string splits on whitespace afterwards.
	        return re.sub(r"\s+\w{1}\s+|\n|-|—", " ", text)

	    filters = [
	        remove_stopwords,
	        strip_numeric,
	        strip_punctuation,
	        strip_tags,
	        strip_multiple_whitespaces,
	        single_letter_replace,
	        str.lower,
	    ]

	    lemma_text = [
	        preprocess_string(' '.join(token.lemma_ for token in doc), filters)
	        for doc in map(nlp, ser)
	    ]

	    # Second stop-word pass: lemmatization and lower-casing can surface
	    # stop words that the gensim filter did not catch earlier in the chain.
	    tokenize_text = [
	        [word for word in tokens if word not in stop_words]
	        for tokens in lemma_text
	    ]

	    return tokenize_text

	class Title_Generator:
	    """Generate candidate game titles with a T5 model and rank them.

	    ``candidate_generator`` produces title candidates for a description and
	    ``candidate_score`` filters, normalizes and ranks those candidates by
	    spaCy similarity to the description text.
	    """

	    # Clean-up patterns for titles after ``str.title()``. They are raw
	    # strings so that ``\b`` is a word boundary (in a normal string literal
	    # it is a backspace character) and ``|`` is a real alternation.
	    _TITLE_OF = re.compile(r"(?<=[ ])Of(?=[ ])")
	    _TITLE_POSSESSIVE = re.compile(r"(?<=[a-z])'S")
	    _TITLE_EDITION = re.compile(r'(?<=\S) (([(]|\b)[Ss]econd [Ee]dition([)]|\b)|[Ss]econd [Ee]dition|2[Nn][Dd] [Ee]dition|([(]|\b)[Tt]hird [Ee]dition([)]|\b)|3[Rr][Dd] [Ee]dition)|["]Second Edition["]')
	    _TITLE_AND = re.compile(r"(?<=[ ])And(?=[ ])")

	    # (pattern, replacement) pairs applied in order to the returned text.
	    _DESCRIPTION_FIXES = (
	        # insert a missing space after sentence-ending punctuation
	        (re.compile(r"(?<=[!.?])(?=[^\s])"), " "),
	        (re.compile(r"([Ff]rom the [Pp]ublisher[: ]|[Ff]rom the [Dd]esigner[: ]|[Gg]ame [Dd]escription)"), ""),
	        (re.compile(r": [Tt]he [Gg]ame: [Tt]he [Gg]ame|: [Tt]he [Gg]ame"), ""),
	        # collapse filler words around the "__" title placeholder
	        (re.compile(r"[Tt]he __"), "__"),
	        (re.compile(r"__ [Gg]ame"), "__"),
	        (re.compile(r"[Tt]he [Gg]ame [Oo]f __"), "__"),
	    )

	    def __init__(self, path, df):
	        """Load the model, tokenizer and spaCy pipeline.

	        Args:
	            path: location passed to ``from_pretrained`` for both the T5
	                model and its tokenizer.
	            df: DataFrame of known games; its ``name`` column is used to
	                reject candidates that are already existing titles.
	        """
	        self.model = T5ForConditionalGeneration.from_pretrained(path)
	        self.tokenizer = T5Tokenizer.from_pretrained(path)
	        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	        self.model.to(self.device)
	        self.game_df = df

	        self.title_iter = -1
	        self.out_titles = None
	        self.best_title = None
	        self.description = None
	        self.nlp = spacy.load("en_core_web_md")

	    def candidate_generator(self, description):
	        """Generate candidate titles for ``description``.

	        The description is prefixed with ``"headline: "`` and decoded with
	        beam search (16 beams in 4 groups, 8 sequences returned).

	        Args:
	            description: text to generate titles for.

	        Returns:
	            A tuple ``(candidates, description)`` where ``candidates`` is the
	            list of decoded titles with pad/end tokens removed and
	            ``description`` is the unchanged input.
	        """
	        text = "headline: " + description

	        encoding = self.tokenizer.encode_plus(text, return_tensors="pt")
	        input_ids = encoding["input_ids"].to(self.device)
	        attention_masks = encoding["attention_mask"].to(self.device)

	        beam_outputs = self.model.generate(
	            input_ids=input_ids,
	            attention_mask=attention_masks,
	            max_length=64,
	            num_beams=16,
	            num_beam_groups=4,
	            num_return_sequences=8,
	            diversity_penalty=.1,
	            repetition_penalty=.9,
	            early_stopping=True)

	        candidates = [
	            self.tokenizer.decode(result).replace('<pad> ', '').replace('</s>', '').replace('<pad>', '')
	            for result in beam_outputs
	        ]

	        return candidates, description

	    @staticmethod
	    def _mask_titles(description, titles, exclusions):
	        """Replace every title (as-is or upper-cased) and exclusion with "__"."""
	        # Empty alternatives are skipped: an empty branch in the alternation
	        # would match at every position of the description.
	        alternatives = [
	            re.escape(title)
	            for title in titles + [title.upper() for title in titles]
	            if title
	        ]
	        # NOTE(review): exclusions are joined unescaped, so they are treated
	        # as regex fragments -- confirm callers rely on that.
	        alternatives.extend(exclusions)
	        if not alternatives:
	            return description
	        return re.sub("((?:" + "|".join(alternatives) + "))", "__", description)

	    def _tidy_title(self, title):
	        """Undo ``str.title()`` artifacts and strip edition suffixes."""
	        title = self._TITLE_OF.sub("of", title)
	        title = self._TITLE_POSSESSIVE.sub("'s", title)
	        title = self._TITLE_EDITION.sub("", title)
	        return self._TITLE_AND.sub("and", title)

	    def _normalize_titles(self, pool):
	        """Drop existing/duplicate titles and return cleaned, title-cased ones."""
	        existing = set(self.game_df["name"])

	        # Case-insensitive, order-preserving de-duplication. The set of seen
	        # keys starts empty; seeding it with the candidates themselves would
	        # silently discard every candidate that is already lower-case.
	        seen = set()
	        tidy = []
	        for title in pool:
	            key = title.lower()
	            if title in existing or key in seen:
	                continue
	            seen.add(key)
	            tidy.append(self._tidy_title(title.title()))

	        # Collapse "X: X" style titles to "X". Segments are stripped before
	        # comparing because the part after the colon normally starts with a
	        # space.
	        cleaned = []
	        for title in tidy:
	            parts = title.split(":")
	            if len(parts) > 1 and parts[0].strip().lower() == parts[1].strip().lower():
	                cleaned.append(parts[0].strip())
	            else:
	                cleaned.append(title)
	        return cleaned

	    def candidate_score(self, candidates, ex_check=None):
	        """Filter, normalize and rank generated titles against the text.

	        Args:
	            candidates: ``(titles, description)`` pair as returned by
	                ``candidate_generator``.
	            ex_check: optional iterable of regex fragments; matches are
	                masked out of the description and, when titles are
	                regenerated, candidates matching any of them are discarded.
	                ``None`` or an empty iterable disables both steps.

	        Returns:
	            A dict with ``'text'`` (the description with titles masked as
	            ``"__"`` and boilerplate removed) and ``'titles'`` (a list of
	            ``(title, score)`` tuples sorted by descending score).
	        """
	        titles = list(candidates[0])
	        description = candidates[1]
	        exclusions = [] if ex_check is None else [pat for pat in ex_check if pat]

	        desc = self._mask_titles(description, titles, exclusions)

	        if "__" in desc:
	            # A title (or excluded term) appears in the text: regenerate from
	            # the text without it and pool the new titles with the old ones.
	            regenerated = self.candidate_generator(desc.replace("__", ""))[0]
	            pool = regenerated + titles
	            if exclusions:
	                blocked = re.compile("(" + "|".join(exclusions) + ")")
	                pool = [cand for cand in pool if not blocked.search(cand)]
	        else:
	            pool = titles

	        #check for existing games and duplicates
	        clean_cand = self._normalize_titles(pool)

	        #text processing
	        token_cand = doc_text_preprocessing(pd.Series(clean_cand)) if clean_cand else []
	        token_art = doc_text_preprocessing(pd.Series([desc]))
	        sim = [self.nlp(" ".join(tokens)) for tokens in token_cand]
	        doc = self.nlp(" ".join(token_art[0]))

	        #scores cosine similarity between generated titles and body text, if the word is unknown (i.e. generator knows it but spacy doesn't)
	        #it assigns a random probability to populate
	        scores = []
	        for title_doc in sim:
	            score = title_doc.similarity(doc)
	            scores.append(score if score != 0 else random.uniform(.3, .7))

	        out_titles = sorted(zip(clean_cand, scores), key=itemgetter(1), reverse=True)

	        for pattern, replacement in self._DESCRIPTION_FIXES:
	            desc = pattern.sub(replacement, desc)

	        return {'text': desc, 'titles': out_titles}