Auto-BoardGame / title_generator.py
Nick Canu
final app push
02c682c
raw
history blame contribute delete
No virus
6.37 kB
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim.parsing import preprocess_string, strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
import spacy
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer
import random
from operator import itemgetter
# Custom text tokenizer from https://github.com/canunj/deconstructing_games by N Canu & K Chen

# spaCy pipeline used for lemmatisation. Loaded lazily and then reused, because
# loading the model from disk on every call is by far the most expensive step
# of the preprocessing.
_PREPROCESSING_NLP = None


def _preprocessing_nlp():
    """Return the shared spaCy pipeline, loading it on first use."""
    global _PREPROCESSING_NLP
    if _PREPROCESSING_NLP is None:
        _PREPROCESSING_NLP = spacy.load(
            "en_core_web_md", exclude=['parser', 'ner', 'textcat']
        )
    return _PREPROCESSING_NLP


def doc_text_preprocessing(ser):
    """Lemmatise, clean and tokenise every text in ``ser``.

    Each text is run through spaCy, its lemmas are joined back into one
    string, and that string is passed through the gensim filter chain below
    (stop-word, digit, punctuation, tag and whitespace stripping, then the
    custom noise filter and lower-casing). NLTK English stop words are removed
    from the resulting tokens.

    Args:
        ser: iterable of strings; the callers in this file pass a pandas
            Series.

    Returns:
        A list with one entry per input text, each entry being the list of
        remaining tokens for that text.
    """
    nlp = _preprocessing_nlp()
    stop_words = set(stopwords.words('english'))

    def strip_noise(text):
        # NOTE(review): a single-character word is removed together with the
        # whitespace on both sides, which joins its neighbours ("a b c" ->
        # "ac"). Kept as in the upstream tokenizer -- confirm this is intended.
        return re.sub(r"\s+\w{1}\s+|\n|-|—", '', text)

    filters = [
        remove_stopwords,
        strip_numeric,
        strip_punctuation,
        strip_tags,
        strip_multiple_whitespaces,
        strip_noise,
        str.lower,
    ]

    # nlp.pipe processes the texts as a batch instead of one call per row.
    lemma_text = [
        preprocess_string(' '.join(token.lemma_ for token in doc), filters)
        for doc in nlp.pipe(ser)
    ]
    # The gensim stop-word filter above runs before lower-casing; this second
    # pass applies the NLTK list to the lower-cased tokens.
    return [
        [word for word in tokens if word not in stop_words]
        for tokens in lemma_text
    ]
class Title_Generator:
    """Generate candidate game titles for a description and rank them.

    A T5 conditional-generation checkpoint produces the candidates; a spaCy
    pipeline scores each cleaned candidate by similarity to the description.

    Attributes set by ``__init__``:
        model, tokenizer: loaded from ``path`` with ``from_pretrained``.
        device: CUDA when ``torch.cuda.is_available()``, otherwise CPU; the
            model is moved to it.
        game_df: the DataFrame passed in. ``candidate_score`` reads its
            ``name`` column to discard titles that already exist.
        title_iter, out_titles, best_title, description: initialised to
            -1 / None and not read by the methods of this class (presumably
            managed by the calling application -- verify against callers).
        nlp: spaCy ``en_core_web_md`` pipeline used for similarity scoring.
    """

    # Text substituted into the description wherever a candidate title (or an
    # ``ex_check`` pattern) occurs.
    _PLACEHOLDER = "__"

    # Title clean-up patterns, applied by ``_tidy_title`` in the order listed.
    # They are raw strings on purpose: in a normal string literal ``\b`` is a
    # backspace character rather than a regex word boundary, which silently
    # disabled the plain "Third Edition" alternative.
    _TITLE_OF_RE = re.compile(r"(?<=[ ])Of(?=[ ])")
    _TITLE_POSSESSIVE_RE = re.compile(r"(?<=[a-z])'S")
    _TITLE_EDITION_RE = re.compile(
        r'(?<=\S) (([(]|\b)[Ss]econd [Ee]dition([)]|\b)|[Ss]econd [Ee]dition|2[Nn][Dd] [Ee]dition'
        r'|([(]|\b)[Tt]hird [Ee]dition([)]|\b)|3[Rr][Dd] [Ee]dition)|["]Second Edition["]'
    )
    _TITLE_AND_RE = re.compile(r"(?<=[ ])And(?=[ ])")

    # Description clean-up as (pattern, replacement) pairs, applied in order.
    _DESCRIPTION_FIXES = (
        # Put a space after sentence punctuation that runs into the next word.
        (re.compile(r"(?<=[!.?])(?=[^\s])"), " "),
        # Remove publisher/designer boilerplate headings.
        (re.compile("([Ff]rom the [Pp]ublisher[: ]|[Ff]rom the [Dd]esigner[: ]|[Gg]ame [Dd]escription)"), ""),
        (re.compile(": [Tt]he [Gg]ame: [Tt]he [Gg]ame|: [Tt]he [Gg]ame"), ""),
        # Fold wording around the placeholder into the placeholder itself.
        (re.compile("[Tt]he __"), "__"),
        (re.compile("__ [Gg]ame"), "__"),
        (re.compile("[Tt]he [Gg]ame [Oo]f __"), "__"),
    )

    def __init__(self, path, df):
        """Load the model, tokenizer and spaCy pipeline.

        Args:
            path: location passed to ``from_pretrained`` for both the T5
                model and its tokenizer.
            df: DataFrame of existing games; must expose a ``name`` column.
        """
        self.model = T5ForConditionalGeneration.from_pretrained(path)
        self.tokenizer = T5Tokenizer.from_pretrained(path)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.game_df = df

        self.title_iter = -1
        self.out_titles = None
        self.best_title = None
        self.description = None

        self.nlp = spacy.load("en_core_web_md")

    def candidate_generator(self, description):
        """Generate candidate titles for ``description``.

        The description is prefixed with ``"headline: "``, tokenised and fed to
        ``model.generate`` with 16 beams in 4 beam groups; 8 sequences are
        returned and decoded.

        Args:
            description: the game description text.

        Returns:
            A ``(candidates, description)`` tuple, where ``candidates`` is the
            list of decoded title strings and ``description`` is the input,
            unchanged.
        """
        text = "headline: " + description
        encoding = self.tokenizer.encode_plus(text, return_tensors="pt")
        input_ids = encoding["input_ids"].to(self.device)
        attention_masks = encoding["attention_mask"].to(self.device)

        beam_outputs = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_masks,
            max_length=64,
            num_beams=16,
            num_beam_groups=4,
            num_return_sequences=8,
            diversity_penalty=.1,
            repetition_penalty=.9,
            early_stopping=True)

        # Let the tokenizer drop its own special tokens instead of string
        # replacing "<pad>" / "</s>", and trim the whitespace that padding can
        # leave behind so that later exact-match comparisons are not defeated
        # by stray spaces.
        candidates = [
            self.tokenizer.decode(result, skip_special_tokens=True).strip()
            for result in beam_outputs
        ]
        return candidates, description

    @staticmethod
    def _mask_titles(description, titles, ex_check=None):
        """Replace every title occurrence in ``description`` with ``"__"``.

        Titles are matched literally, as given and upper-cased. ``ex_check``
        entries are inserted unescaped, i.e. treated as regular-expression
        fragments. Empty titles and empty patterns are ignored: an empty
        alternative would match at every position of the text.
        """
        literals = {title for title in titles if title}
        literals |= {title.upper() for title in literals}
        # Longest first, so a short title cannot pre-empt a longer title that
        # starts with it ("Catan" vs "Catan: Seafarers").
        ordered = sorted(literals, key=lambda title: (-len(title), title))
        alternatives = [re.escape(title) for title in ordered]
        alternatives.extend(pattern for pattern in (ex_check or ()) if pattern)
        if not alternatives:
            return description
        return re.sub("((?:" + "|".join(alternatives) + "))", "__", description)

    @staticmethod
    def _dedupe_titles(titles):
        """Title-case ``titles``, keeping the first of each case-insensitive duplicate."""
        # Adapted from https://stackoverflow.com/questions/42165779/python-how-to-remove-duplicate-valuescase-insensitive-from-a-list-with-same-o
        # The seen-set must start empty; seeding it with the input would drop
        # every title that is already lower-case.
        seen = set()
        unique = []
        for title in titles:
            key = title.lower()
            if key not in seen:
                seen.add(key)
                unique.append(title.title())
        return unique

    @classmethod
    def _tidy_title(cls, title):
        """Fix title-casing artefacts and strip second/third edition suffixes."""
        title = cls._TITLE_OF_RE.sub("of", title)
        title = cls._TITLE_POSSESSIVE_RE.sub("'s", title)
        title = cls._TITLE_EDITION_RE.sub("", title)
        return cls._TITLE_AND_RE.sub("and", title)

    @staticmethod
    def _collapse_repeated_subtitle(title):
        """Turn ``"X: X"`` (compared case-insensitively) into ``"X"``."""
        head, colon, tail = title.partition(":")
        # Strip before comparing: the subtitle normally follows ": ", so the
        # raw halves differ by a leading space.
        if colon and head.strip().lower() == tail.strip().lower():
            return head.strip()
        return title

    @classmethod
    def _tidy_description(cls, text):
        """Apply the description clean-up substitutions in order."""
        for pattern, replacement in cls._DESCRIPTION_FIXES:
            text = pattern.sub(replacement, text)
        return text

    def _clean_titles(self, titles):
        """Drop existing-game names and duplicates, then normalise the rest."""
        # Build the lookup once instead of filtering the DataFrame per title.
        known_names = set(self.game_df.name)
        # dict.fromkeys removes exact duplicates while keeping generation
        # order, so the result does not depend on string hashing.
        fresh = [
            title for title in dict.fromkeys(titles)
            if title and title not in known_names
        ]
        cleaned = (
            self._collapse_repeated_subtitle(self._tidy_title(title))
            for title in self._dedupe_titles(fresh)
        )
        # Normalisation can make two titles identical (e.g. after removing an
        # edition suffix), so de-duplicate once more.
        return list(dict.fromkeys(title for title in cleaned if title))

    def candidate_score(self, candidates, ex_check=None):
        """Clean the candidate titles and rank them against the description.

        Every occurrence of a candidate title (or of an ``ex_check`` pattern)
        in the description is replaced with ``"__"``. If anything was
        replaced, a second batch of candidates is generated from the
        description with those occurrences removed, and the combined
        candidates are filtered against ``ex_check``. The surviving titles are
        checked against ``game_df.name``, de-duplicated, normalised and scored
        by spaCy similarity to the preprocessed description.

        Args:
            candidates: ``(titles, description)`` pair as returned by
                ``candidate_generator``.
            ex_check: optional list of regular-expression fragments (used
                unescaped). Matches are masked in the description and, when
                titles are regenerated, titles matching any fragment are
                discarded. ``None`` or an empty list disables both.

        Returns:
            ``{'text': cleaned_description, 'titles': ranked}`` where
            ``ranked`` is a list of ``(title, score)`` tuples sorted by
            descending score.
        """
        titles = list(candidates[0])
        description = candidates[1]
        patterns = [pattern for pattern in (ex_check or ()) if pattern]

        masked = self._mask_titles(description, titles, patterns)
        if self._PLACEHOLDER in masked:
            # A title leaked from the description: regenerate from the text
            # with the leaked spans removed and pool both batches.
            regenerated = self.candidate_generator(
                masked.replace(self._PLACEHOLDER, "")
            )[0]
            titles = regenerated + titles
            if patterns:
                blocked = re.compile("(" + "|".join(patterns) + ")")
                titles = [title for title in titles if not blocked.search(title)]

        # check for existing games and duplicates
        clean_cand = self._clean_titles(titles)
        text = self._tidy_description(masked)
        if not clean_cand:
            return {'text': text, 'titles': []}

        # text processing
        token_cand = doc_text_preprocessing(pd.Series(clean_cand))
        token_art = doc_text_preprocessing(pd.Series([masked]))
        doc = self.nlp(" ".join(token_art[0]))

        # scores similarity between generated titles and body text; if the
        # similarity is 0 (i.e. the generator knows the word but spacy
        # doesn't) it assigns a random score so the title is still ranked
        scores = []
        for tokens in token_cand:
            similarity = self.nlp(" ".join(tokens)).similarity(doc)
            scores.append(similarity if similarity != 0 else random.uniform(.3, .7))

        out_titles = sorted(zip(clean_cand, scores), key=itemgetter(1), reverse=True)
        return {'text': text, 'titles': out_titles}