# Source: Hugging Face Space "AutoBG / Auto-BoardGame" (status: Running; file size: 6,371 bytes)

import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim.parsing import preprocess_string, strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
import spacy
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer
import random
from operator import itemgetter

# Holds the spaCy pipeline used by doc_text_preprocessing so the model is read
# from disk once per process instead of on every call.
_PREPROCESSING_NLP_CACHE = {}


def _preprocessing_nlp():
    """Return the shared lemmatisation pipeline, loading it on first use."""
    if "nlp" not in _PREPROCESSING_NLP_CACHE:
        _PREPROCESSING_NLP_CACHE["nlp"] = spacy.load(
            "en_core_web_md", exclude=['parser', 'ner', 'textcat'])
    return _PREPROCESSING_NLP_CACHE["nlp"]


#Custom text tokenizer from https://github.com/canunj/deconstructing_games by N Canu & K Chen
def doc_text_preprocessing(ser):
    """Lemmatise and tokenise a collection of texts.

    Each text is run through the spaCy pipeline, its lemmas are joined back
    into a string, and that string goes through the gensim filter chain
    (stop-word removal, digit/punctuation/tag stripping, whitespace
    collapsing, single-letter removal, lower-casing) before being split into
    tokens. NLTK English stop words are removed from the resulting tokens.

    Args:
        ser: pandas Series (any iterable of strings works) holding the texts.

    Returns:
        list[list[str]]: one token list per input text, in input order.
    """
    nlp = _preprocessing_nlp()
    stop_words = set(stopwords.words('english'))

    def drop_single_letters(text):
        # Replace an isolated single character with one space. Substituting
        # the empty string here would also swallow the surrounding whitespace
        # and fuse the neighbouring words ("catan s game" -> "catangame").
        text = re.sub(r"\s+\w{1}\s+", " ", text)
        # Newlines and dashes are removed outright, as before.
        return re.sub(r"\n|-|—", "", text)

    filters = [remove_stopwords, strip_numeric, strip_punctuation, strip_tags,
               strip_multiple_whitespaces, drop_single_letters, str.lower]

    # nlp.pipe processes the texts as a batch and yields one Doc per text.
    lemma_text = [
        preprocess_string(' '.join(token.lemma_ for token in doc), filters)
        for doc in nlp.pipe(ser)
    ]

    # Second stop-word pass: lemmatised/lower-cased tokens can be NLTK stop
    # words that the gensim filter did not remove.
    return [[word for word in tokens if word not in stop_words]
            for tokens in lemma_text]

class Title_Generator:

    def __init__(self, path, df):
        """Load the T5 model and tokenizer from ``path`` and store ``df``.

        Args:
            path: location handed to ``from_pretrained`` for both the model
                and the tokenizer.
            df: stored as ``self.game_df``; ``candidate_score`` reads its
                ``name`` column to reject titles that already exist.
        """
        # Use the GPU when torch reports one, otherwise stay on the CPU.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        model = T5ForConditionalGeneration.from_pretrained(path)
        tokenizer = T5Tokenizer.from_pretrained(path)
        model.to(device)

        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.game_df = df

        # Bookkeeping slots; nothing in the methods shown here reads them
        # (presumably used by other methods or by the caller - confirm).
        self.title_iter = -1
        self.out_titles = None
        self.best_title = None
        self.description = None

        # Full spaCy pipeline, used for title/description similarity scoring.
        self.nlp = spacy.load("en_core_web_md")


    def candidate_generator(self, description):
        """Generate candidate titles for ``description`` with the T5 model.

        The description is prefixed with ``"headline: "``, encoded, and passed
        to ``generate`` using grouped beam search (16 beams in 4 groups,
        8 returned sequences).

        Args:
            description: text to generate titles for.

        Returns:
            tuple: ``(titles, description)`` - the decoded titles with the
            ``<pad>`` / ``</s>`` markers removed, and the unchanged input text.
        """
        prompt = "headline: " + description
        encoded = self.tokenizer.encode_plus(prompt, return_tensors = "pt")

        generation_kwargs = dict(
            input_ids = encoded["input_ids"].to(self.device),
            attention_mask = encoded["attention_mask"].to(self.device),
            max_length = 64,
            num_beams = 16,
            num_beam_groups = 4,
            num_return_sequences = 8,
            diversity_penalty = .1,
            repetition_penalty = .9,
            early_stopping = True,
        )
        sequences = self.model.generate(**generation_kwargs)

        def strip_markers(decoded):
            # Same removals, in the same order, as chained str.replace calls.
            for marker in ('<pad> ', '</s>', '<pad>'):
                decoded = decoded.replace(marker, '')
            return decoded

        titles = [strip_markers(self.tokenizer.decode(seq)) for seq in sequences]
        return titles, description
    
    def candidate_score(self, candidates, ex_check=None):
        """Mask, de-duplicate, clean and rank generated titles.

        Args:
            candidates: ``(titles, description)`` pair in the shape returned
                by ``candidate_generator``: a list of title strings and the
                text they were generated from.
            ex_check: optional list of regular-expression fragments. They are
                inserted into the patterns unescaped (NOTE(review): confirm
                callers pass patterns rather than raw user text). Their
                matches are masked in the description, and when titles are
                regenerated any candidate matching one of them is dropped.
                ``None`` or an empty list disables both.

        Returns:
            dict: ``{'text': str, 'titles': list[tuple[str, float]]}`` - the
            description with title mentions replaced by ``__`` and tidied,
            and the surviving titles sorted by descending similarity score.
        """
        titles, description = candidates[0], candidates[1]

        # Empty fragments are discarded: an empty regex alternative matches at
        # every position, which would mask the whole text and reject every
        # candidate.
        ex_terms = [term for term in (ex_check or []) if term]

        # Mask every occurrence of a candidate title (as generated and
        # upper-cased) and every ex_check match in the description.
        literal_terms = [re.escape(term)
                         for term in titles + [t.upper() for t in titles]
                         if term]
        mask_terms = literal_terms + ex_terms
        if mask_terms:
            desc = re.sub("(?:" + "|".join(mask_terms) + ")", "__", description)
        else:
            desc = description

        # Something was masked (or the text already contained "__"):
        # regenerate titles from the text without the masked spans and merge
        # them with the first batch.
        if "__" in desc:
            regenerated, _ = self.candidate_generator(desc.replace("__", ""))
            titles = regenerated + titles
            if ex_terms:
                excluded = re.compile("(?:" + "|".join(ex_terms) + ")")
                titles = [t for t in titles if not excluded.search(t)]

        #check for existing games and duplicates
        # Existing games are matched exactly against the `name` column;
        # duplicates are detected case-insensitively and the first occurrence
        # is kept (title-cased), so the result order is deterministic.
        existing_names = set(self.game_df["name"])
        seen = set()
        unique_titles = []
        for title in titles:
            key = title.lower()
            if title in existing_names or key in seen:
                continue
            seen.add(key)
            unique_titles.append(title.title())

        # str.title() over-capitalises small words and possessives; undo that
        # and strip edition suffixes. Raw strings matter here: in a normal
        # string "\b" is a backspace character, not a word boundary, which
        # kept an unparenthesised "Third Edition" from ever being removed.
        of_pat = re.compile(r"(?<=[ ])Of(?=[ ])")
        possessive_pat = re.compile(r"(?<=[a-z])'S")
        edition_pat = re.compile(
            r'(?<=\S) (([(]|\b)[Ss]econd [Ee]dition([)]|\b)|[Ss]econd [Ee]dition|2[Nn][Dd] [Ee]dition'
            r'|([(]|\b)[Tt]hird [Ee]dition([)]|\b)|3[Rr][Dd] [Ee]dition)|["]Second Edition["]'
        )
        and_pat = re.compile(r"(?<=[ ])And(?=[ ])")

        clean_cand = []
        for title in unique_titles:
            title = of_pat.sub("of", title)
            title = possessive_pat.sub("'s", title)
            title = edition_pat.sub("", title)
            title = and_pat.sub("and", title)

            # Collapse "Name: Name" to "Name". Whitespace around the colon is
            # ignored so the usual "Name: Name" spelling is recognised.
            parts = title.split(":")
            if len(parts) > 1 and parts[0].strip().lower() == parts[1].strip().lower():
                clean_cand.append(parts[0].strip())
            else:
                clean_cand.append(title)

        #text processing
        token_cand = doc_text_preprocessing(pd.Series(clean_cand))
        token_art = doc_text_preprocessing(pd.Series([desc]))
        title_docs = [self.nlp(" ".join(tokens)) for tokens in token_cand]
        desc_doc = self.nlp(" ".join(token_art[0]))

        #scores cosine similarity between generated titles and body text, if the word is unknown (i.e. generator knows it but spacy doesn't)
        #it assigns a random probability to populate
        scores = []
        for title_doc in title_docs:
            similarity = title_doc.similarity(desc_doc)
            scores.append(similarity if similarity != 0 else random.uniform(.3, .7))

        out_titles = sorted(zip(clean_cand, scores), key=itemgetter(1), reverse=True)

        # Tidy the masked description: add the missing space after sentence
        # punctuation, drop publisher/designer boilerplate, and fold phrases
        # that wrap a mask ("The __", "__ Game", ...) into the bare mask.
        tidy_steps = (
            (r"(?<=[!.?])(?=[^\s])", " "),
            (r"([Ff]rom the [Pp]ublisher[: ]|[Ff]rom the [Dd]esigner[: ]|[Gg]ame [Dd]escription)", ""),
            (r": [Tt]he [Gg]ame: [Tt]he [Gg]ame|: [Tt]he [Gg]ame", ""),
            (r"[Tt]he __", "__"),
            (r"__ [Gg]ame", "__"),
            (r"[Tt]he [Gg]ame [Oo]f __", "__"),
        )
        for pattern, replacement in tidy_steps:
            desc = re.sub(pattern, replacement, desc)

        return {'text':desc,'titles':out_titles}