# Alternative input manager for the description generator
import re
from operator import itemgetter

import numpy as np
import pandas as pd
import sklearn.metrics
import spacy
from gensim import corpora, similarities
from gensim.test.utils import get_tmpfile

# doc_text_preprocessing (used in query_score) is an external helper defined elsewhere in the project


class input_manager:
    # initialize the key dictionary from the vector data frame and set the community top N
    def __init__(self, key_df, slim_df, search_tokens, top_n=10):
        self.key_df = key_df
        self.slim_df = slim_df
        self.search_tokens = search_tokens
        self.key = dict(zip(list(key_df.columns), np.zeros(len(key_df.columns))))
        self.top_n = top_n
        self.nlp = spacy.load("en_core_web_md")
    # translate input text to a vector
    def set_input(self, input_cats):
        # separate known features from unknown ones so the correct group tag can be applied
        k_flags = [cat for cat in input_cats if cat in self.key]
        unk_flags = [cat for cat in input_cats if cat not in self.key]

        # process within-feature-class similarity for each unknown input
        outs = []
        for word in unk_flags:
            if re.match(r"game_type_", word):
                tok = self.nlp(word.split("_")[-1])
                mtch = max([(key, key.similarity(tok)) for key in self.search_tokens[0]], key=itemgetter(1))
                # if no known match is found (the model doesn't recognize the input word),
                # discard it - other solutions are performance-prohibitive
                if mtch[1] > 0:
                    outs.append("game_type_" + str(mtch[0]))
            elif re.match(r"mechanic_", word):
                tok = self.nlp(word.split("_")[-1])
                mtch = max([(key, key.similarity(tok)) for key in self.search_tokens[1]], key=itemgetter(1))
                if mtch[1] > 0:
                    outs.append("mechanic_" + str(mtch[0]))
            elif re.match(r"category_", word):
                tok = self.nlp(word.split("_")[-1])
                mtch = max([(key, key.similarity(tok)) for key in self.search_tokens[2]], key=itemgetter(1))
                if mtch[1] > 0:
                    outs.append("category_" + str(mtch[0]))
            elif re.match(r"family_", word):
                tok = self.nlp(word.split("_")[-1])
                mtch = max([(key, key.similarity(tok)) for key in self.search_tokens[3]], key=itemgetter(1))
                if mtch[1] > 0:
                    outs.append("family_" + str(mtch[0]))

        # once unknowns are processed, rejoin the nearest matches to the known flags
        k_flags = list(set(k_flags + outs))

        # preserve the global key and output a copy with the input keys activated to 1
        d = self.key.copy()
        for cat in k_flags:
            d[cat] = 1.0
        return d
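
    # Example (hypothetical tags - assumes "dice" is nearest to a known mechanic
    # suffix such as "dice rolling" in the word-vector space):
    #   set_input(["mechanic_dice", "category_fantasy"])
    # maps the unknown "mechanic_dice" onto the closest known mechanic column via
    # similarity and returns the key dict with both matched columns set to 1.0.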
    
    def input_parser(self, in_vec):
        # extract the active keys from the processed vector
        ks = [k for k, v in in_vec.items() if v == 1]

        # raw "total" match score - how many of the hot input columns are hot in each existing vector
        inter = self.key_df[ks].sum(axis=1)

        # operating on each df separately seems slightly quicker than transforming the df here - may refactor

        # drop any row with fewer than 3 matches (minimum match check)
        cand_vec = self.key_df.iloc[list(inter[inter >= 3].index)]
        # if parsing returns fewer ranked matches than the specified top n, reduce the threshold to 1 match and check again
        if len(cand_vec) < self.top_n:
            cand_vec = self.key_df.iloc[list(inter[inter >= 1].index)]

        cand_slim = self.slim_df.iloc[list(inter[inter >= 3].index)]
        if len(cand_slim) < self.top_n:
            cand_slim = self.slim_df.iloc[list(inter[inter >= 1].index)]

        # return in the order n_neighbors unpacks: text frame, vector frame, raw input vector
        return cand_slim, cand_vec, np.array(list(in_vec.values()))
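
    # Example (hypothetical counts): with top_n=10, if only 6 communities share
    # >= 3 hot columns with the input, the threshold falls back to >= 1 so the
    # candidate pool is wide enough for ranking.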

    # calculate per-community pairwise Jaccard similarity to the input, split by feature class
    def ret_jaccard(self, in_vec, t_vec):
        gt_score = sklearn.metrics.jaccard_score(in_vec[1:9], t_vec[1:9], zero_division=0)
        cat_score = sklearn.metrics.jaccard_score(in_vec[192:276], t_vec[192:276], zero_division=0)
        mech_score = sklearn.metrics.jaccard_score(in_vec[9:192], t_vec[9:192], zero_division=0)
        fam_score = sklearn.metrics.jaccard_score(in_vec[276:3901], t_vec[276:3901], zero_division=0)
        if in_vec[0] == t_vec[0]:
            coop_score = 1
        else:
            coop_score = 0

        # the initial weighting treats all feature classes as equal - updating this via a feedback mechanism is under consideration
        return np.mean([gt_score, cat_score, mech_score, fam_score, coop_score])
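
    # Worked example (hypothetical overlaps): identical game_type flags (1.0),
    # no category overlap (0.0), half-overlapping mechanics (0.5), no family
    # overlap (0.0), and matching coop flags (1) give
    # mean([1.0, 0.0, 0.5, 0.0, 1]) = 0.5.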

    # return the community nearest neighbors
    def n_neighbors(self, in_data):
        # applies the jaccard function to each vector row and maps the scores onto the "full" df with text
        slim, vec, in_vec = in_data
        vec['score'] = vec.apply(lambda x: self.ret_jaccard(in_vec, x), raw=True, axis=1)
        slim['score'] = vec['score']

        # convert scores to ranks - this avoids splitting equal-scoring groups inappropriately
        slim['rank'] = slim['score'].rank(ascending=False)
        return slim[slim['rank'] < self.top_n].sort_values(by=['rank'])
    
    def query_score(self, outframe, gen_text):
        # requires the text-processing function, the nearest-neighbor community dataframe, and a piece of generated text
        query = doc_text_preprocessing(pd.Series(gen_text))
        desc_tokens = pd.concat([outframe['cleaned_descriptions'], pd.Series(query)])
        desc_dict = corpora.Dictionary()
        desc_corpus = [desc_dict.doc2bow(doc, allow_update=True) for doc in desc_tokens]
        temp_index = get_tmpfile("index")
        index = similarities.Similarity(temp_index, desc_corpus, num_features=len(desc_dict.token2id))

        sim_stack = []
        for sims in index:
            sim_stack.append(sims)

        # weight each neighbor's text similarity by its community match score
        return (gen_text, np.mean(np.multiply(outframe['score'], sim_stack[-1][:-1])))
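

# A minimal usage sketch, not part of the class - the file names, column layout,
# and search-token construction below are assumptions for illustration; the real
# pipeline builds key_df/slim_df and search_tokens elsewhere.
if __name__ == "__main__":
    key_df = pd.read_csv("community_vectors.csv")  # binary feature vectors (hypothetical path)
    slim_df = pd.read_csv("community_text.csv")    # matching text rows (hypothetical path)

    nlp = spacy.load("en_core_web_md")
    # pre-tokenized tag suffixes per feature class, in the order set_input expects
    search_tokens = [
        [nlp(c.split("_")[-1]) for c in key_df.columns if c.startswith(pre)]
        for pre in ("game_type_", "mechanic_", "category_", "family_")
    ]

    im = input_manager(key_df, slim_df, search_tokens, top_n=10)
    vec = im.set_input(["mechanic_dice", "category_fantasy"])  # hypothetical input tags
    neighbors = im.n_neighbors(im.input_parser(vec))
    print(neighbors[['score', 'rank']])

    # scoring a generated description against the neighbor set (assumes the external
    # doc_text_preprocessing helper is importable and slim_df carries a
    # 'cleaned_descriptions' column):
    # text, score = im.query_score(neighbors, "A cooperative dice-rolling adventure...")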