Stanford-TH committed on
Commit
7ca7ec8
1 Parent(s): f8714d8

Upload 7 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Themes_And_Concepts.csv filter=lfs diff=lfs merge=lfs -text
ModelSimilarity.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import torch
3
+ import numpy as np
4
+ from ast import literal_eval
5
+ from transformers import AutoModel, AutoTokenizer
6
+ from torch.utils.data import TensorDataset
7
+
8
class MovieSimilarityModel:
    """Score movies against each other by themes/concepts and genres.

    Each movie is described by a list of theme/concept phrases and a list of
    genres read from a CSV file. Both are embedded with a Hugging Face
    transformer (mean-pooled token embeddings) and two movies are compared
    with a weighted sum of:

    * the mean pairwise cosine similarity of their phrase embeddings, and
    * the cosine similarity of their genre embeddings.
    """

    # Weights of the two components of the final similarity score.
    THEMES_WEIGHT = 0.70
    GENRES_WEIGHT = 0.30

    def __init__(self, csv_file_path, model_name='sentence-transformers/all-mpnet-base-v2', max_length=32):
        """Load the dataset, the tokenizer/model and tokenize every row.

        Args:
            csv_file_path: Path of the CSV file; it must contain every column
                listed in ``columns_to_eval``.
            model_name: Hugging Face model identifier used for both the
                tokenizer and the encoder.
            max_length: Token length every encoded sequence is padded or
                truncated to.
        """
        self.dataset = pd.read_csv(csv_file_path)
        # Columns stored in the CSV as Python-literal strings.
        self.columns_to_eval = ['genres', 'Keywords', 'Negative_Reviews', 'Positive_Reviews', 'positive_relationship', 'negative_relationship', 'Themes', 'Concepts']
        self._preprocess_data()

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # NOTE: no device move happens here; the model stays wherever
        # from_pretrained() places it.
        self.model = AutoModel.from_pretrained(model_name)
        self.max_length = max_length

        # Row-wise `+` of the two columns; presumably both hold lists after
        # _preprocess_data(), so this concatenates them per movie.
        self.sentences = (self.dataset['Themes'] + self.dataset['Concepts']).to_list()
        self.genres = self.dataset['genres'].to_list()

        self.data = self.SentenceDataset(self.sentences, self.genres, self.tokenizer, max_length=self.max_length)

    def _preprocess_data(self):
        """Parse the literal-encoded columns of ``self.dataset`` in place."""
        for col in self.columns_to_eval:
            self.dataset[col] = self.dataset[col].apply(self._safe_literal_eval)

    @staticmethod
    def _safe_literal_eval(value):
        """Return ``literal_eval(value)``, or ``value`` itself if it cannot be parsed."""
        try:
            return literal_eval(value)
        except (ValueError, SyntaxError):
            return value

    class SentenceDataset(TensorDataset):
        """Tokenized phrases and genres, indexable per movie."""

        def __init__(self, sentences, genres, tokenizer, max_length=32):
            """Tokenize all phrases and genres eagerly.

            Args:
                sentences: One entry per movie, each an iterable of phrases.
                genres: One entry per movie, each a list of genre words
                    (encoded with ``is_split_into_words=True``).
                tokenizer: Tokenizer exposing ``encode_plus``.
                max_length: Length sequences are padded/truncated to.
            """
            self.sentences = sentences
            self.genres = genres
            self.tokenizer = tokenizer
            self.max_length = max_length
            # One list of encodings per movie, one encoding per phrase.
            self.encoded_sentences = [
                [
                    tokenizer.encode_plus(
                        phrase,
                        padding='max_length',
                        return_attention_mask=True,
                        truncation=True,
                        max_length=max_length,
                        is_split_into_words=False,
                    )
                    for phrase in phrases
                ]
                for phrases in sentences
            ]
            # One encoding per movie. Uses the configured max_length (the
            # previous hard-coded 32 ignored the constructor argument).
            self.genres_categories = [
                tokenizer.encode_plus(
                    genre_words,
                    padding='max_length',
                    return_attention_mask=True,
                    return_tensors='pt',
                    truncation=True,
                    max_length=max_length,
                    is_split_into_words=True,
                )
                for genre_words in genres
            ]

        def __len__(self):
            """Return the number of movies."""
            return len(self.sentences)

        def __getitem__(self, idx):
            """Return ``[phrase_inputs, genre_encoding]`` for movie ``idx``.

            ``phrase_inputs`` is a dict with stacked ``input_ids`` and
            ``attention_mask`` tensors (one row per phrase); the genre entry
            is the tokenizer output created in ``__init__``.
            """
            encoded_phrases = self.encoded_sentences[idx]
            genres = self.genres_categories[idx]
            input_ids = torch.tensor([enc['input_ids'] for enc in encoded_phrases])
            attention_masks = torch.tensor([enc['attention_mask'] for enc in encoded_phrases])
            return [{'input_ids': input_ids, 'attention_mask': attention_masks}, genres]

    @staticmethod
    def mean_pooling(model_output, attention_mask):
        """Average token embeddings over the positions selected by the mask.

        Args:
            model_output: Sequence whose first element holds the token
                embeddings.
            attention_mask: Mask with one entry per token; zero entries are
                excluded from the average.

        Returns:
            The masked mean over dimension 1 of the token embeddings.
        """
        token_embeddings = model_output[0]  # First element contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # The clamp avoids a division by zero for an all-zero mask.
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def compute_sentence_embeddings(self):
        """Embed every movie's phrases and genres with the model.

        Returns:
            A pair of lists ``(sentence_embeddings, genres_embeddings)`` with
            one tensor per movie each.
        """
        sentence_embedded_data = []
        genres_embedded_data = []
        for i in range(len(self.data)):
            # Index once: every __getitem__ call rebuilds the phrase tensors.
            themes_inputs, genres_inputs = self.data[i]
            with torch.no_grad():
                embeddings_themes_concepts = self.model(**themes_inputs)
                embeddings_genres = self.model(**genres_inputs)
                sentence_embeddings = self.mean_pooling(embeddings_themes_concepts, themes_inputs['attention_mask'])
                genres_embeddings = self.mean_pooling(embeddings_genres, genres_inputs['attention_mask'])
            sentence_embedded_data.append(sentence_embeddings)  # Keep as tensors
            genres_embedded_data.append(genres_embeddings)  # Keep as tensors
        return sentence_embedded_data, genres_embedded_data

    @staticmethod
    def similarity_matrix(a, b, eps=1e-9):
        """Return the cosine-similarity matrix between the rows of ``a`` and ``b``.

        Args:
            a: 2-D tensor, one embedding per row.
            b: 2-D tensor, one embedding per row.
            eps: Lower bound applied to the row norms before dividing.

        Returns:
            ``(sim_mt, combinations)``. ``sim_mt[i, j]`` is the cosine
            similarity of ``a[i]`` and ``b[j]``; the operand with fewer rows
            is zero-padded first, so the matrix is square and the extra
            entries are 0. ``combinations`` is the number of real row pairs
            (rows of ``a`` times rows of ``b``).
        """
        a_n, b_n = a.norm(dim=1)[:, None], b.norm(dim=1)[:, None]
        a_norm = a / torch.clamp(a_n, min=eps)
        b_norm = b / torch.clamp(b_n, min=eps)
        a_norm, b_norm, combinations = MovieSimilarityModel.tensor_pad(a_norm, b_norm)
        sim_mt = torch.mm(a_norm, b_norm.transpose(0, 1))
        return sim_mt, combinations

    @staticmethod
    def tensor_pad(a, b):
        """Zero-pad the tensor with fewer rows so both have the same row count.

        Args:
            a: 2-D tensor.
            b: 2-D tensor.

        Returns:
            ``(a, b, combinations)`` in the same order as the arguments
            (the previous version returned them swapped when ``b`` was the
            shorter one). ``combinations`` is the product of the original
            row counts.
        """
        x_a, _ = a.shape
        x_b, _ = b.shape
        combinations = x_a * x_b

        # Pad spec is (left, right) for the last dim, then (top, bottom) for
        # the first dim: rows of zeros are appended at the bottom.
        if x_a < x_b:
            a = torch.nn.functional.pad(a, (0, 0, 0, x_b - x_a))
        elif x_b < x_a:
            b = torch.nn.functional.pad(b, (0, 0, 0, x_a - x_b))
        return a, b, combinations

    def _pair_similarity(self, sentence_embed_x, genres_embed_x, sentence_embed_y, genres_embed_y):
        """Return the weighted similarity of two movies as a Python float."""
        sim_mt, combinations = self.similarity_matrix(sentence_embed_x, sentence_embed_y)
        # Padding rows contribute 0 to the sum, so this is the mean over the
        # real phrase pairs only.
        thc_similarity = (sim_mt.sum() / combinations)
        genres_similarity = torch.nn.functional.cosine_similarity(genres_embed_x, genres_embed_y)
        final_similarity = (self.THEMES_WEIGHT * thc_similarity) + (self.GENRES_WEIGHT * genres_similarity)
        return final_similarity.item()

    def compute_similarity_matrix(self):
        """Return the all-pairs similarity scores of the loaded embeddings.

        Requires ``sentence_embedded_data`` and ``genres_embedded_data`` to be
        set (see ``load_tensor_list``).

        Returns:
            A list of lists; entry ``[i][j]`` is the score of movie ``i``
            against movie ``j``.
        """
        return [
            [
                self._pair_similarity(sentence_embed_x, genres_embed_x, sentence_embed_y, genres_embed_y)
                for sentence_embed_y, genres_embed_y in zip(self.sentence_embedded_data, self.genres_embedded_data)
            ]
            for sentence_embed_x, genres_embed_x in zip(self.sentence_embedded_data, self.genres_embedded_data)
        ]

    def encode_input_movie(self, input_themes, input_genres):
        """Embed a user-supplied movie description.

        Args:
            input_themes: List with one entry per movie, each a list of
                phrases; only the first entry is embedded.
            input_genres: List with one entry per movie, each a list of genre
                words; only the first entry is embedded.

        Returns:
            ``(sentence_embeddings, genres_embeddings)`` for the first entry.
        """
        input_data = self.SentenceDataset(input_themes, input_genres, self.tokenizer, max_length=self.max_length)
        themes_inputs, genres_inputs = input_data[0]

        with torch.no_grad():
            embeddings_themes_concepts = self.model(**themes_inputs)
            embeddings_genres = self.model(**genres_inputs)
            input_sentence_embeddings = self.mean_pooling(embeddings_themes_concepts, themes_inputs['attention_mask'])
            input_genres_embeddings = self.mean_pooling(embeddings_genres, genres_inputs['attention_mask'])

        return input_sentence_embeddings, input_genres_embeddings

    def compare_input_vs_dataset(self, input_sentence_embeddings, input_genres_embeddings):
        """Rank the dataset movies against the given input embeddings.

        Requires ``sentence_embedded_data`` and ``genres_embedded_data`` to be
        set (see ``load_tensor_list``).

        Returns:
            A DataFrame with columns ``Movie`` and ``Similarity Score``
            holding the (at most) five best matches, best first.
        """
        similarity_scores = [
            self._pair_similarity(
                input_sentence_embeddings,
                input_genres_embeddings,
                dataset_sentence_embeddings,
                dataset_genres_embeddings,
            )
            for dataset_sentence_embeddings, dataset_genres_embeddings in zip(self.sentence_embedded_data, self.genres_embedded_data)
        ]

        # argsort is ascending: take the last five and reverse them.
        top_5_indices = np.argsort(similarity_scores)[-5:][::-1]
        top_5_scores = np.array(similarity_scores)[top_5_indices]
        top_5_movies = self.dataset['Movie'].iloc[top_5_indices].values

        return pd.DataFrame(list(zip(top_5_movies, top_5_scores)), columns=['Movie', 'Similarity Score'])

    def load_tensor_list(self, sentence_path='./tac_embeddings.pt', genres_path='./genres_embeddings.pt', map_location='cpu'):
        """Load the precomputed embeddings from disk.

        Args:
            sentence_path: File holding the themes/concepts embeddings.
            genres_path: File holding the genre embeddings.
            map_location: Device the tensors are mapped to. Defaults to CPU
                because this class never moves the model to another device,
                so embeddings saved on a GPU would otherwise not be
                comparable with freshly computed ones.
        """
        # NOTE(review): torch.load unpickles the files; only load trusted
        # embedding files.
        self.sentence_embedded_data = torch.load(sentence_path, map_location=map_location)
        self.genres_embedded_data = torch.load(genres_path, map_location=map_location)

    def save_embeddings_to_dataframe(self):
        """Compute all embeddings and return them keyed by column name.

        Despite the name, nothing is written to disk and no DataFrame is
        built: the result is a plain dict with the keys ``Movie``,
        ``Sentence_Embeddings`` and ``Genres_Embeddings``.
        """
        sentence_embedded_data, genres_embedded_data = self.compute_sentence_embeddings()

        return {
            'Movie': self.dataset['Movie'],
            'Sentence_Embeddings': sentence_embedded_data,
            'Genres_Embeddings': genres_embedded_data,
        }
README.md CHANGED
@@ -1,13 +1,13 @@
1
- ---
2
- title: Similarity Model
3
- emoji: 🦀
4
- colorFrom: yellow
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 4.41.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: Similarity Model
3
+ emoji: 🦀
4
+ colorFrom: yellow
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 4.41.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Themes_And_Concepts.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:538e953c7030591110f38921f4a2252a5e61400a6daad84f1896159b2b9b1300
3
+ size 78347252
app.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import pandas as pd
4
+ from ModelSimilarity import MovieSimilarityModel
5
+
6
# MovieSimilarityModel is imported from ModelSimilarity; constructing it reads
# the CSV and loads the tokenizer and model.
csv_file_path = './Themes_And_Concepts.csv'
movie_similarity_model = MovieSimilarityModel(csv_file_path)

# Load the precomputed dataset embeddings (unconditional: the files must exist).
movie_similarity_model.load_tensor_list()


def _flatten_cells(rows):
    """Return the non-empty cells of a Gradio list value as a flat list of strings.

    Accepts either a flat list of values or a list of rows (lists/tuples);
    ``None`` and blank cells are dropped.
    """
    if rows is None:
        return []
    cells = []
    for row in rows:
        if isinstance(row, (list, tuple)):
            cells.extend(row)
        else:
            cells.append(row)
    return [str(cell).strip() for cell in cells if cell is not None and str(cell).strip()]


def get_top_5_similar_movies(input_themes, input_genres):
    """Return the five dataset movies most similar to the described movie.

    Args:
        input_themes: Value of the themes list component.
        input_genres: Value of the genres list component.

    Returns:
        The DataFrame produced by ``compare_input_vs_dataset``.
    """
    # gr.List delivers one row per entry, while encode_input_movie embeds only
    # the first item of the lists it is given. Flatten the rows and wrap them
    # so that all themes/genres form a single movie instead of only the first
    # row being used.
    themes = _flatten_cells(input_themes)
    genres = _flatten_cells(input_genres)
    if not themes or not genres:
        raise gr.Error("Please enter at least one theme and one genre.")

    input_sentence_embeddings, input_genres_embeddings = movie_similarity_model.encode_input_movie([themes], [genres])

    # Get top 5 similar movies
    return movie_similarity_model.compare_input_vs_dataset(input_sentence_embeddings, input_genres_embeddings)


# Define Gradio interface
input_themes = gr.List(headers=["Themes"], label="Input Movie Themes (array of strings)")
input_genres = gr.List(headers=["Genres"], label="Input Movie Genres (array of strings)")
output = gr.Dataframe(type="pandas", label="Top 5 Similar Movies")

interface = gr.Interface(fn=get_top_5_similar_movies, inputs=[input_themes, input_genres], outputs=output)

# Launch the Gradio interface
interface.launch()
genres_embeddings.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c70d517b5cd91db2a14677e2c2162d14ee13a5f1cdbf0c2e996f411b9752078b
3
+ size 2881850
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch == 2.4.0
2
+ transformers == 4.44.0
3
+ numpy == 2.0.1
4
+ pandas == 2.2.2
tac_embeddings.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09d628a47bbc59f74def388861a49e27c9401bb67d9b66bfd1daaee7d9d006ad
3
+ size 22871834