import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim.parsing import (preprocess_string, strip_tags, strip_numeric,
                            strip_multiple_whitespaces, strip_punctuation,
                            remove_stopwords)
import spacy
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import random
from operator import itemgetter
# Custom text tokenizer from https://github.com/canunj/deconstructing_games by N Canu & K Chen
def doc_text_preprocessing(ser):
    """Lemmatize, clean, and tokenize a pandas Series of documents."""
    nlp = spacy.load("en_core_web_md", exclude=['parser', 'ner', 'textcat'])
    stop_words = set(stopwords.words('english'))
    single_letter_replace = lambda c: re.sub(r"\s+\w{1}\s+|\n|-|—", '', c)
    to_lower_func = lambda c: c.lower()
    # lemmatize with spaCy, then apply gensim's preprocessing filters
    lemma_text = [preprocess_string(
        ' '.join([token.lemma_ for token in desc]),
        [remove_stopwords, strip_numeric, strip_punctuation, strip_tags,
         strip_multiple_whitespaces, single_letter_replace, to_lower_func]
    ) for desc in ser.apply(lambda x: nlp(x))]
    # drop any remaining NLTK stopwords
    tokenize_text = [[word for word in string if word not in stop_words]
                     for string in lemma_text]
    return tokenize_text
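
# Illustrative call (hypothetical input/output, assuming the models above are installed):
# >>> doc_text_preprocessing(pd.Series(["The wizards cast powerful spells!"]))
# [['wizard', 'cast', 'powerful', 'spell']]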
class Title_Generator:
    def __init__(self, path, df):
        # fine-tuned T5 model/tokenizer for title generation
        self.model = T5ForConditionalGeneration.from_pretrained(path)
        self.tokenizer = T5Tokenizer.from_pretrained(path)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        # reference dataframe of existing games (expects a `name` column)
        self.game_df = df
        self.title_iter = -1
        self.out_titles = None
        self.best_title = None
        self.description = None
        self.nlp = spacy.load("en_core_web_md")
    def candidate_generator(self, description):
        text = "headline: " + description
        encoding = self.tokenizer.encode_plus(text, return_tensors="pt")
        input_ids = encoding["input_ids"].to(self.device)
        attention_masks = encoding["attention_mask"].to(self.device)
        # diverse beam search: 16 beams in 4 groups, returning 8 candidates
        beam_outputs = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_masks,
            max_length=64,
            num_beams=16,
            num_beam_groups=4,
            num_return_sequences=8,
            diversity_penalty=.1,
            repetition_penalty=.9,
            early_stopping=True)
        # skip_special_tokens drops the <pad>/</s> markers the original stripped by hand
        candidates = [self.tokenizer.decode(result, skip_special_tokens=True).strip()
                      for result in beam_outputs]
        return candidates, description
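
    # Returned shape (illustrative, not from the original source):
    # (["Galactic Traders", "Space Merchants", ...], "<the description passed in>")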
    def candidate_score(self, candidates, ex_check=None):
        # mask any candidate title (as-is or upper-cased) that appears verbatim in the description
        pat = re.compile("((?:" + "|".join(map(re.escape, candidates[0] + [cand.upper() for cand in candidates[0]])) + "))")
        desc = re.sub(pat, "__", candidates[1])
        if "__" in desc:
            # a candidate leaked from the text: regenerate on the masked description
            # so the model can't simply copy the title, then pool both candidate sets
            hold = candidates[0]
            gen_desc = desc.replace("__", "")
            candidates = self.candidate_generator(gen_desc)
            candidates = (candidates[0] + hold, desc)
        # drop candidates matching the exclusion list, if one was supplied
        if ex_check:
            reg = re.compile("(" + "|".join(ex_check) + ")")
            candidates = ([cand for cand in candidates[0] if not reg.search(cand)], candidates[1])
        # check for existing games and duplicates
        # transform function adapted from https://stackoverflow.com/questions/42165779/python-how-to-remove-duplicate-valuescase-insensitive-from-a-list-with-same-o
        def transform(L):
            S = set()  # must start empty, or every already-lowercase entry is dropped
            return [item.title() for item in L if item.lower() not in S and not S.add(item.lower())]
        # keep only titles that do not already exist in the games dataframe
        clean_cand_step = list(set(cand for cand in candidates[0]
                                   if len(self.game_df[self.game_df.name.isin([cand])]) == 0))
        clean_cand_step = transform(clean_cand_step)
        # normalize connector-word casing and strip edition suffixes
        clean_cand_step = [re.sub(re.compile("(?<=[ ])And(?=[ ])"), 'and',
                           re.sub(re.compile(r'(?<=\S) (([(]|\b)[Ss]econd [Ee]dition([)]|\b)|[Ss]econd [Ee]dition|2[Nn][Dd] [Ee]dition|([(]|\b)[Tt]hird [Ee]dition([)]|\b)|3[Rr][Dd] [Ee]dition)|["]Second Edition["]'), "",
                           re.sub(re.compile("(?<=[a-z])'S"), "'s",
                           re.sub(re.compile("(?<=[ ])Of(?=[ ])"), "of", x))))
                           for x in clean_cand_step]
        # collapse redundant "Title: Title" duplications
        clean_cand = []
        for cand in clean_cand_step:
            try:
                inter = cand.split(":")
                if inter[0].lower() == inter[1].lower():
                    clean_cand.append(inter[0])
                else:
                    clean_cand.append(cand)
            except IndexError:  # no ":" in the candidate
                clean_cand.append(cand)
        # text processing
        token_cand = doc_text_preprocessing(pd.Series(clean_cand))
        token_art = doc_text_preprocessing(pd.Series([candidates[1]]))
        sim = [self.nlp(title) for title in [" ".join(title) for title in token_cand]]
        doc = self.nlp(" ".join(token_art[0]))
        # score cosine similarity between each generated title and the body text;
        # if a title is out of vocabulary (the generator knows it but spaCy doesn't),
        # assign a random probability instead of zero
        scores = [x if x != 0 else random.uniform(.3, .7)
                  for x in [tok.similarity(doc) for tok in sim]]
        out_titles = sorted(list(zip(clean_cand, scores)), key=itemgetter(1), reverse=True)
        # tidy the masked description for display
        pat = re.compile(r"(?<=[!.?])(?=[^\s])")  # insert a space after sentence-ending punctuation
        pat2 = re.compile("([Ff]rom the [Pp]ublisher[: ]|[Ff]rom the [Dd]esigner[: ]|[Gg]ame [Dd]escription)")
        pat3 = re.compile(": [Tt]he [Gg]ame: [Tt]he [Gg]ame|: [Tt]he [Gg]ame")
        pat4 = re.compile("[Tt]he __")
        pat5 = re.compile("__ [Gg]ame")
        pat6 = re.compile("[Tt]he [Gg]ame [Oo]f __")
        desc = re.sub(pat, " ", candidates[1])
        desc = re.sub(pat2, "", desc)
        desc = re.sub(pat3, "", desc)
        desc = re.sub(pat4, "__", desc)
        desc = re.sub(pat5, "__", desc)
        desc = re.sub(pat6, "__", desc)
        return {'text': desc, 'titles': out_titles}
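
# Example usage (a minimal sketch; the checkpoint path, CSV file, and description
# below are hypothetical placeholders, not part of the original source):
#
#   games_df = pd.read_csv("games.csv")  # must provide a `name` column
#   tg = Title_Generator("./t5-title-model", games_df)
#   cands = tg.candidate_generator("Players compete to build the most "
#                                  "prosperous trade empire among the stars...")
#   result = tg.candidate_score(cands, ex_check=["Monopoly"])
#   print(result['titles'][:3])  # top three (title, similarity-score) pairs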