File size: 4,985 Bytes
b3f3132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from transformers import  AutoModel, AutoTokenizer
import pandas as pd
import torch
from torch.utils.data import Dataset
import logging
from tqdm import tqdm
from torch.utils.data import DataLoader
import torch.nn.functional as F
import pickle
import string
from abc import abstractmethod
import json


class AbstractMoviesRanker:
    """Abstract class for ranking items"""
    def __init__(self, df, index_matrix, score_name = "score"):
        self.df = df
        self.ids = self.df.index.values
        self.index_matrix = index_matrix
        self.score_name = score_name

    @abstractmethod
    def encode_query(self, query):
        pass
    
    def get_scores(self, encoded_query):
        return torch.mm(encoded_query, self.index_matrix.transpose(0,1))[0].tolist()
    
    def get_top_ids(self, scores, topn=6):
        ids_scores_pairs = list(zip(self.ids.tolist(), scores))
        ids_scores_pairs = sorted(ids_scores_pairs, key = lambda x:x[1], reverse = True)
        sorted_ids = [v[0] for v in ids_scores_pairs]
        sorted_scores = [v[1] for v in ids_scores_pairs]
        sorted_df = self.df.loc[sorted_ids[:topn], :]
        sorted_df.loc[:,self.score_name] = sorted_scores[:topn]
        return sorted_df

    def run_query(self, query, topn=6):
        encoded_query = self.encode_query(query)
        scores = self.get_scores(encoded_query)
        return self.get_top_ids(scores, topn)

    depunctuate = staticmethod(lambda x: x.translate(str.maketrans('','',string.punctuation)))

class SparseTfIdfRanker(AbstractMoviesRanker):
    """Sparse Ranking via TF iDF"""
    def __init__(self, df, index_matrix, vectorizer_path):
        super(SparseTfIdfRanker, self).__init__(df, index_matrix, score_name = 'tfidf-score')
        self.vectorizer = pickle.load(open(vectorizer_path, 'rb'))
        self.index_matrix = self.index_matrix.to_dense() ##For dot products

    def encode_query(self, query):
        encoded_query = torch.tensor(self.vectorizer.transform([self.depunctuate(query)]).todense(), dtype = torch.float32)
        return F.normalize(encoded_query, p=2)


class BertRanker(AbstractMoviesRanker):
    """Dense Ranking with embedding matrix"""
    def __init__(self, df, index_matrix, modelpath):
        super(BertRanker, self).__init__(df, index_matrix, score_name = "bert-score")
        self.tokenizer = AutoTokenizer.from_pretrained(modelpath)
        self.model = AutoModel.from_pretrained(modelpath)

    def encode_query(self, query):
        tok_q = self.tokenizer(query, return_tensors="pt", padding="max_length", max_length = 128, truncation=True)
        o = self.model(**tok_q)
        encoded_query = self.mean_pooling(o, tok_q['attention_mask'])
        return F.normalize(encoded_query, p=2)

    @staticmethod
    def mean_pooling(model_output, attention_mask):
        token_embeddings = model_output.last_hidden_state #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)



class SparseDenseMoviesRanker():
    """Sparse Ranking via TF iDF, filtering a first rank, then dense ranking on these items"""
    def __init__(self, df, modelpath, bert_index, sparse_index, vectorizer_path):
        self.df =df 
        self.ids = self.df.index.values
        self.tfidf_engine = SparseTfIdfRanker(df, sparse_index, vectorizer_path)
        self.modelpath = modelpath
        self.bert_index = bert_index

    def run_query(self, query, topn=6, first_ranking=1000):
        tfidf_sorted_frame = self.tfidf_engine.run_query(query, topn=first_ranking)
        firstranking_index = self.bert_index[tfidf_sorted_frame.index.values]
        self.bert_engine = BertRanker(tfidf_sorted_frame, firstranking_index, self.modelpath)
        bert_sorted_frame = self.bert_engine.run_query(query, topn=topn)
        return bert_sorted_frame

    @classmethod
    def from_json_config(cls, jsonfile):
        with open(jsonfile) as fp:
            conf = json.loads(fp.read())

        ##Load data for ranking
        df = pd.read_pickle(conf['dataframe'])

        ##Load indices, e.g. embeddings and encoding utilities
        bert_index = torch.load(conf['bert_index'])
        sparse_index = torch.load(conf['sparse_index'])
        vectorizer_path = conf['vectorizer_path']
        modelpath = conf['modelpath']

        ##Conf for first ranking
        firstranking = conf.get('firstranking', 100)
        ranker = cls(df, modelpath, bert_index, sparse_index, vectorizer_path)
        return ranker 


if __name__=='__main__':
    
    engine = SparseDenseMoviesRanker.from_json_config('conf.json')

    for query in ["une histoire de pirates et de chasse au trésor", "une histoire de gangsters avec de l'argent"]:
        print(query)
        final_df = engine.run_query(query)
        print(final_df.head())