# Auto-BoardGame / title_generator.py
# Author: Nick Canu — commit 02c682c ("final app push")
# NOTE: the lines above were hosting-page chrome (raw / history blame / 6.37 kB)
# captured by extraction; kept here as a comment so the module parses.
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim.parsing import preprocess_string, strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
import spacy
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer
import random
from operator import itemgetter
#Custom text tokenizer from https://github.com/canunj/deconstructing_games by N Canu & K Chen
def doc_text_preprocessing(ser):
    """Tokenize and normalize a pandas Series of document texts.

    Custom text tokenizer from https://github.com/canunj/deconstructing_games
    by N Canu & K Chen.

    Args:
        ser: pandas Series of raw description strings.

    Returns:
        list[list[str]]: one token list per document — lemmatized,
        lower-cased, punctuation/number/stop-word free.
    """
    # Model is loaded on every call; acceptable for batch use, but hoist it
    # if this ever lands on a hot path.
    nlp = spacy.load("en_core_web_md", exclude=['parser', 'ner', 'textcat'])
    stop_words = set(stopwords.words('english'))

    # Strip isolated single letters, newlines and dashes, then lower-case.
    # (raw string fixes the invalid "\s" escape in the original)
    single_letter_replace = lambda c: re.sub(r"\s+\w{1}\s+|\n|-|—", '', c)
    to_lower_func = lambda c: c.lower()

    # Lemmatize with spaCy, then run the gensim filter pipeline over the
    # joined lemmas.
    lemma_text = [preprocess_string(
        ' '.join(token.lemma_ for token in doc),
        [remove_stopwords, strip_numeric, strip_punctuation, strip_tags,
         strip_multiple_whitespaces, single_letter_replace, to_lower_func]
    ) for doc in ser.apply(lambda x: nlp(x))]

    # Second stop-word pass: lemmatization can surface stop-word forms that
    # remove_stopwords (applied to the pre-filter text) did not catch.
    return [[word for word in doc if word not in stop_words]
            for doc in lemma_text]
class Title_Generator:
    """Generate and rank candidate board-game titles for a description.

    Uses a fine-tuned T5 checkpoint (diverse beam search) to propose titles,
    filters out names that already exist in the supplied game dataframe, and
    ranks survivors by spaCy vector similarity against the description.
    """

    def __init__(self, path, df):
        """
        Args:
            path: directory/name of the fine-tuned T5 checkpoint.
            df: dataframe of existing games; its ``name`` column is used to
                reject candidate titles that collide with a published game.
        """
        self.model = T5ForConditionalGeneration.from_pretrained(path)
        self.tokenizer = T5Tokenizer.from_pretrained(path)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.game_df = df
        # Iteration state used by callers when cycling through titles.
        self.title_iter = -1
        self.out_titles = None
        self.best_title = None
        self.description = None
        self.nlp = spacy.load("en_core_web_md")

    def candidate_generator(self, description):
        """Return ``(candidate_titles, description)`` via diverse beam search."""
        text = "headline: " + description
        encoding = self.tokenizer.encode_plus(text, return_tensors="pt")
        input_ids = encoding["input_ids"].to(self.device)
        attention_masks = encoding["attention_mask"].to(self.device)

        beam_outputs = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_masks,
            max_length=64,
            num_beams=16,
            num_beam_groups=4,
            num_return_sequences=8,
            diversity_penalty=.1,
            repetition_penalty=.9,
            early_stopping=True)

        # Strip T5 special tokens from the decoded sequences.
        candidates = [
            self.tokenizer.decode(result)
                .replace('<pad> ', '').replace('</s>', '').replace('<pad>', '')
            for result in beam_outputs]
        return candidates, description

    def candidate_score(self, candidates, ex_check=None):
        """Clean, filter and similarity-score candidate titles.

        Args:
            candidates: ``(title_list, description)`` tuple as returned by
                :meth:`candidate_generator`.
            ex_check: optional list of strings (treated as regex alternates)
                that must not appear in any returned title.

        Returns:
            dict: ``{'text': masked/cleaned description,
                     'titles': [(title, score), ...] sorted best-first}``.
        """
        # Mask occurrences of any candidate title (original or upper-cased)
        # — plus the exclusion strings, when given — inside the description,
        # so a title that merely echoes the source text is caught below.
        title_alts = "|".join(map(
            re.escape,
            candidates[0] + [cand.upper() for cand in candidates[0]]))
        if ex_check is not None:
            pat = re.compile("((?:" + title_alts + "|" + "|".join(ex_check) + "))")
        else:
            pat = re.compile("((?:" + title_alts + "))")
        desc = re.sub(pat, "__", candidates[1])

        # A "__" hit means a candidate leaked from the description text:
        # regenerate from the masked description and drop anything matching
        # the exclusion list. BUG FIX: the original compiled ex_check here
        # unconditionally and raised TypeError when ex_check was None; now
        # the regeneration pass only runs when an exclusion list exists.
        if ex_check is not None and re.search(re.compile(re.escape("__")), desc):
            reg = re.compile("(" + "|".join(ex_check) + ")")
            hold = candidates[0]
            gen_desc = re.sub(re.compile(re.escape("__")), "", desc)
            candidates = self.candidate_generator(gen_desc)
            kept = [cand for cand in candidates[0] + hold if not reg.search(cand)]
            candidates = (kept, desc)

        # Case-insensitive de-duplication that title-cases survivors; adapted
        # from https://stackoverflow.com/questions/42165779. BUG FIX: the
        # original seeded the set with set(L), which silently discarded any
        # candidate that was already all-lowercase.
        def transform(L):
            seen = set()
            return [item.title() for item in L
                    if item.lower() not in seen and not seen.add(item.lower())]

        # Reject candidates whose name already exists in the game dataframe.
        unseen = [cand for cand in candidates[0]
                  if len(self.game_df[self.game_df.name.isin([cand])]) == 0]
        clean_cand_step = transform(list(set(unseen)))

        # Normalize casing of connectives and strip edition suffixes.
        clean_cand_step = [
            re.sub(re.compile(r"(?<=[ ])And(?=[ ])"), 'and',
            re.sub(re.compile(r'(?<=\S) (([(]|\b)[Ss]econd [Ee]dition([)]|\b)|[Ss]econd [Ee]dition|2[Nn][Dd] [Ee]dition|([(]|\b)[Tt]hird [Ee]dition([)]|\b)|3[Rr][Dd] [Ee]dition)|["]Second Edition["]'), "",
            re.sub(re.compile(r"(?<=[a-z])'S"), "'s",
            re.sub(re.compile(r"(?<=[ ])Of(?=[ ])"), "of", x))))
            for x in clean_cand_step]

        # Collapse redundant "X: X" titles to just "X". Explicit length
        # check replaces the original bare except around the [1] access.
        clean_cand = []
        for cand in clean_cand_step:
            parts = cand.split(":")
            if len(parts) > 1 and parts[0].lower() == parts[1].lower():
                clean_cand.append(parts[0])
            else:
                clean_cand.append(cand)

        # Tokenize titles and description identically, then score cosine
        # similarity between each title and the body text. A zero score
        # (the generator produced a word spaCy has no vector for) is
        # replaced with a random placeholder probability.
        token_cand = doc_text_preprocessing(pd.Series(clean_cand))
        token_art = doc_text_preprocessing(pd.Series([candidates[1]]))
        sim = [self.nlp(" ".join(title)) for title in token_cand]
        doc = self.nlp(" ".join(token_art[0]))
        scores = [x if x != 0 else random.uniform(.3, .7)
                  for x in [tok.similarity(doc) for tok in sim]]
        out_titles = sorted(list(zip(clean_cand, scores)),
                            key=itemgetter(1), reverse=True)

        # Final description cleanup: spacing after sentence enders, boilerplate
        # lead-ins, and degenerate "the game" phrasing around masked titles.
        pat = re.compile(r"(?<=[!.?])(?=[^\s])")
        pat2 = re.compile(r"([Ff]rom the [Pp]ublisher[: ]|[Ff]rom the [Dd]esigner[: ]|[Gg]ame [Dd]escription)")
        pat3 = re.compile(r": [Tt]he [Gg]ame: [Tt]he [Gg]ame|: [Tt]he [Gg]ame")
        pat4 = re.compile(r"[Tt]he __")
        pat5 = re.compile(r"__ [Gg]ame")
        pat6 = re.compile(r"[Tt]he [Gg]ame [Oo]f __")
        desc = re.sub(pat, " ", candidates[1])
        desc = re.sub(pat2, "", desc)
        desc = re.sub(pat3, "", desc)
        desc = re.sub(pat4, "__", desc)
        desc = re.sub(pat5, "__", desc)
        desc = re.sub(pat6, "__", desc)

        return {'text': desc, 'titles': out_titles}