File size: 6,371 Bytes
b0829c1
 
d7ea903
 
98f8d85
b0829c1
 
 
 
ae88252
a02dc94
b0829c1
 
 
ae88252
b0829c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26b17df
b0829c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
02c682c
 
 
 
 
 
1a371f7
26b17df
b0829c1
02c682c
b0829c1
 
 
02c682c
b0829c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26b17df
 
b0829c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim.parsing import preprocess_string, strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
import spacy
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer
import random
from operator import itemgetter

# Lazily-populated cache for the lemmatisation pipeline so the model is read
# from disk once per process instead of once per call.
_PREPROCESSING_NLP = None


def _get_preprocessing_nlp():
    """Return the shared spaCy pipeline used for lemmatisation, loading it on first use."""
    global _PREPROCESSING_NLP
    if _PREPROCESSING_NLP is None:
        _PREPROCESSING_NLP = spacy.load(
            "en_core_web_md", exclude=['parser', 'ner', 'textcat']
        )
    return _PREPROCESSING_NLP


#Custom text tokenizer from https://github.com/canunj/deconstructing_games by N Canu & K Chen
def doc_text_preprocessing(ser):
    """Lemmatise and tokenise every text in ``ser``.

    Processing steps, in order, for each element:

    1. run the text through spaCy and join the token lemmas with spaces;
    2. apply the gensim filters ``remove_stopwords``, ``strip_numeric``,
       ``strip_punctuation``, ``strip_tags``, ``strip_multiple_whitespaces``,
       the single-letter/dash remover and lower-casing, then split on
       whitespace (``preprocess_string``);
    3. drop any remaining NLTK English stop words.

    Args:
        ser: iterable of strings (the callers in this file pass a
            ``pandas.Series``).

    Returns:
        A list with one entry per input text; each entry is the list of
        surviving tokens for that text.
    """
    nlp = _get_preprocessing_nlp()
    stop_words = set(stopwords.words('english'))

    def single_letter_replace(text):
        # Removes isolated single characters (with their surrounding
        # whitespace), newlines and dashes. The match is replaced by the empty
        # string, so the words on either side of a removed letter are joined.
        return re.sub(r"\s+\w{1}\s+|\n|-|—", '', text)

    def to_lower_func(text):
        return text.lower()

    filters = [remove_stopwords, strip_numeric, strip_punctuation, strip_tags,
               strip_multiple_whitespaces, single_letter_replace, to_lower_func]

    lemma_text = [
        preprocess_string(' '.join(token.lemma_ for token in nlp(text)), filters)
        for text in ser
    ]

    # Lower-casing happens after gensim's stop-word filter, so capitalised
    # stop words survive until this second pass.
    return [[word for word in tokens if word not in stop_words]
            for tokens in lemma_text]

class Title_Generator:
    """Generate and rank candidate titles for a game description.

    Wraps a T5 conditional-generation checkpoint and a spaCy pipeline. The
    data frame passed at construction is consulted through its ``name``
    column to discard candidates that match an existing entry.
    """

    # Title clean-up patterns (applied to title-cased candidates).
    _OF_RE = re.compile(r"(?<=[ ])Of(?=[ ])")
    _AND_RE = re.compile(r"(?<=[ ])And(?=[ ])")
    _POSSESSIVE_RE = re.compile(r"(?<=[a-z])'S")
    # Raw string on purpose: in a plain string literal ``\b`` is a backspace
    # character, which silently disabled the word-boundary alternatives.
    _EDITION_RE = re.compile(
        r'(?<=\S) (([(]|\b)[Ss]econd [Ee]dition([)]|\b)|[Ss]econd [Ee]dition'
        r'|2[Nn][Dd] [Ee]dition|([(]|\b)[Tt]hird [Ee]dition([)]|\b)'
        r'|3[Rr][Dd] [Ee]dition)|["]Second Edition["]'
    )

    # Description clean-up patterns, applied in this order.
    _SENTENCE_GAP_RE = re.compile(r"(?<=[!.?])(?=[^\s])")
    _BOILERPLATE_RE = re.compile("([Ff]rom the [Pp]ublisher[: ]|[Ff]rom the [Dd]esigner[: ]|[Gg]ame [Dd]escription)")
    _THE_GAME_RE = re.compile(": [Tt]he [Gg]ame: [Tt]he [Gg]ame|: [Tt]he [Gg]ame")
    _THE_MASK_RE = re.compile("[Tt]he __")
    _MASK_GAME_RE = re.compile("__ [Gg]ame")
    _GAME_OF_MASK_RE = re.compile("[Tt]he [Gg]ame [Oo]f __")

    def __init__(self, path, df):
        """Load the model and tokenizer from ``path`` and keep ``df`` for name checks.

        Args:
            path: location handed to ``from_pretrained`` for both the T5 model
                and its tokenizer.
            df: data frame of known games; only its ``name`` column is read
                by this class.
        """
        self.model = T5ForConditionalGeneration.from_pretrained(path)
        self.tokenizer = T5Tokenizer.from_pretrained(path)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.game_df = df

        # Bookkeeping slots; nothing in this class writes to them after init.
        self.title_iter = -1
        self.out_titles = None
        self.best_title = None
        self.description = None
        self.nlp = spacy.load("en_core_web_md")

    def candidate_generator(self, description):
        """Generate candidate titles for ``description`` with beam search.

        Args:
            description: free text describing the game.

        Returns:
            A tuple ``(candidates, description)`` where ``candidates`` is the
            list of decoded titles (special ``<pad>``/``</s>`` markers removed)
            and ``description`` is the unmodified input.
        """
        # NOTE(review): "headline: " is presumably the task prefix the
        # checkpoint was fine-tuned with - confirm against the training code.
        text = "headline: " + description

        encoding = self.tokenizer.encode_plus(text, return_tensors="pt")
        input_ids = encoding["input_ids"].to(self.device)
        attention_masks = encoding["attention_mask"].to(self.device)

        beam_outputs = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_masks,
            max_length=64,
            num_beams=16,
            num_beam_groups=4,
            num_return_sequences=8,
            diversity_penalty=.1,
            repetition_penalty=.9,
            early_stopping=True)

        candidates = [
            self.tokenizer.decode(result)
            .replace('<pad> ', '').replace('</s>', '').replace('<pad>', '')
            for result in beam_outputs
        ]

        return candidates, description

    @staticmethod
    def _mask_titles(description, titles, ex_check=None):
        """Replace every occurrence of a title (or ``ex_check`` pattern) in the text with ``__``."""
        titles = list(titles)
        alternatives = [re.escape(t) for t in titles + [t.upper() for t in titles]]
        if ex_check:
            # ex_check entries are inserted unescaped, i.e. they are treated
            # as regular-expression fragments, exactly as before.
            alternatives.extend(ex_check)
        # An empty alternative matches at every position and would spray "__"
        # through the whole description, so drop them.
        alternatives = [alt for alt in alternatives if alt]
        if not alternatives:
            return description
        return re.sub("((?:" + "|".join(alternatives) + "))", "__", description)

    def _clean_titles(self, titles):
        """Drop existing/duplicate titles and normalise the casing of the rest."""
        # Build the lookup once instead of scanning the frame per candidate.
        existing = set(self.game_df.name)

        # Case-insensitive de-duplication that keeps the first spelling seen
        # and preserves input order. The seen-set starts empty: seeding it
        # with the raw titles discarded every title that was already
        # lower-case (or had no cased characters at all).
        seen = set()
        unique = []
        for title in titles:
            key = title.lower()
            if title in existing or key in seen:
                continue
            seen.add(key)
            unique.append(title.title())

        cleaned = []
        for title in unique:
            # str.title() capitalises "Of", "And" and the possessive "'S";
            # undo that and strip edition suffixes.
            title = self._OF_RE.sub("of", title)
            title = self._POSSESSIVE_RE.sub("'s", title)
            title = self._EDITION_RE.sub("", title)
            title = self._AND_RE.sub("and", title)

            # Collapse "X: X" into "X". Parts are compared stripped so the
            # usual space after the colon does not defeat the check.
            parts = title.split(":")
            if len(parts) > 1 and parts[0].strip().lower() == parts[1].strip().lower():
                cleaned.append(parts[0].strip())
            else:
                cleaned.append(title)
        return cleaned

    @classmethod
    def _tidy_description(cls, desc):
        """Fix sentence spacing and strip boilerplate phrases around ``__`` masks."""
        desc = cls._SENTENCE_GAP_RE.sub(" ", desc)
        desc = cls._BOILERPLATE_RE.sub("", desc)
        desc = cls._THE_GAME_RE.sub("", desc)
        desc = cls._THE_MASK_RE.sub("__", desc)
        desc = cls._MASK_GAME_RE.sub("__", desc)
        desc = cls._GAME_OF_MASK_RE.sub("__", desc)
        return desc

    def candidate_score(self, candidates, ex_check=None):
        """Clean, de-duplicate and rank generated titles against the description.

        Args:
            candidates: the ``(titles, description)`` pair returned by
                ``candidate_generator``; ``titles`` must be a list of strings.
            ex_check: optional list of regular-expression fragments. Matches
                are masked in the description and any title matching one of
                them is rejected. ``None`` or an empty list disables both.

        Returns:
            ``{'text': description, 'titles': ranked}`` where ``description``
            has title mentions replaced by ``__`` and boilerplate removed, and
            ``ranked`` is a list of ``(title, score)`` tuples sorted by
            descending similarity score.
        """
        titles, description = list(candidates[0]), candidates[1]

        desc = self._mask_titles(description, titles, ex_check)

        if "__" in desc:
            # The description leaked a title: regenerate from the text with
            # the masks removed and pool the new titles with the old ones.
            regenerated, _ = self.candidate_generator(desc.replace("__", ""))
            titles = list(regenerated) + titles
            if ex_check:
                excluded = re.compile("(" + "|".join(ex_check) + ")")
                titles = [t for t in titles if not excluded.search(t)]

        #check for existing games and duplicates
        clean_cand = self._clean_titles(titles)

        #text processing
        token_cand = doc_text_preprocessing(pd.Series(clean_cand))
        token_art = doc_text_preprocessing(pd.Series([desc]))
        sim = [self.nlp(" ".join(tokens)) for tokens in token_cand]
        doc = self.nlp(" ".join(token_art[0]))

        #scores cosine similarity between generated titles and body text, if the word is unknown (i.e. generator knows it but spacy doesn't)
        #it assigns a random probability to populate
        scores = []
        for title_doc in sim:
            score = title_doc.similarity(doc)
            scores.append(score if score != 0 else random.uniform(.3, .7))

        out_titles = sorted(zip(clean_cand, scores), key=itemgetter(1), reverse=True)

        return {'text': self._tidy_description(desc), 'titles': out_titles}