Spaces:
Running
Running
Stanford-TH
committed on
Commit
•
7ca7ec8
1
Parent(s):
f8714d8
Upload 7 files
Browse files- .gitattributes +1 -0
- ModelSimilarity.py +166 -0
- README.md +13 -13
- Themes_And_Concepts.csv +3 -0
- app.py +31 -0
- genres_embeddings.pt +3 -0
- requirements.txt +4 -0
- tac_embeddings.pt +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
Themes_And_Concepts.csv filter=lfs diff=lfs merge=lfs -text
|
ModelSimilarity.py
ADDED
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import torch
|
3 |
+
import numpy as np
|
4 |
+
from ast import literal_eval
|
5 |
+
from transformers import AutoModel, AutoTokenizer
|
6 |
+
from torch.utils.data import TensorDataset
|
7 |
+
|
8 |
+
class MovieSimilarityModel:
    """Score movies against each other using themes/concepts and genres.

    Each movie is represented by one embedding per theme/concept phrase plus
    a single genre embedding. The score of a pair is a weighted sum of the
    mean pairwise cosine similarity of the phrase embeddings and the cosine
    similarity of the genre embeddings.
    """

    # Weights of the two components of the final score.
    THEMES_WEIGHT = 0.70
    GENRES_WEIGHT = 0.30

    def __init__(self, csv_file_path, model_name='sentence-transformers/all-mpnet-base-v2', max_length=32):
        """Load the dataset, the tokenizer/model and tokenise every movie.

        Args:
            csv_file_path: Path of the CSV read with ``pandas.read_csv``. It
                must provide the columns listed in ``columns_to_eval``; a
                ``Movie`` column is read by the ranking/export methods.
            model_name: Hugging Face model identifier used for both the
                tokenizer and the encoder.
            max_length: Token length every phrase is padded/truncated to.
        """
        self.dataset = pd.read_csv(csv_file_path)
        self.columns_to_eval = ['genres', 'Keywords', 'Negative_Reviews', 'Positive_Reviews', 'positive_relationship', 'negative_relationship', 'Themes', 'Concepts']
        self._preprocess_data()

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # The model stays on the device from_pretrained loads it to; this
        # class never moves it explicitly.
        self.model = AutoModel.from_pretrained(model_name)
        self.max_length = max_length

        # Element-wise "+" of the two columns; assumes both hold lists after
        # _preprocess_data so that the result is one phrase list per movie.
        self.sentences = (self.dataset['Themes'] + self.dataset['Concepts']).to_list()
        self.genres = self.dataset['genres'].to_list()

        self.data = self.SentenceDataset(self.sentences, self.genres, self.tokenizer, max_length=self.max_length)

    def _preprocess_data(self):
        """Parse the stringified Python literals stored in ``columns_to_eval``."""
        for col in self.columns_to_eval:
            self.dataset[col] = self.dataset[col].apply(self._safe_literal_eval)

    @staticmethod
    def _safe_literal_eval(value):
        """Return ``literal_eval(value)``, or ``value`` itself if it is not a literal."""
        try:
            return literal_eval(value)
        except (ValueError, SyntaxError):
            return value

    class SentenceDataset(TensorDataset):
        """Tokenised view of per-movie phrase lists and genre lists.

        Only ``__len__`` and ``__getitem__`` are used; the ``TensorDataset``
        initialiser is intentionally not called.
        """

        def __init__(self, sentences, genres, tokenizer, max_length=32):
            """Tokenise every phrase of every movie and every genre list.

            Args:
                sentences: One list of phrases (strings) per movie.
                genres: One list of genre words per movie.
                tokenizer: Tokenizer exposing ``encode_plus``.
                max_length: Padding/truncation length for both encodings.
            """
            self.sentences = sentences
            self.genres = genres
            self.tokenizer = tokenizer
            self.max_length = max_length
            # One encoding per phrase, grouped by movie.
            self.encoded_sentences = [
                [
                    tokenizer.encode_plus(phrase, padding='max_length', return_attention_mask=True, truncation=True, max_length=max_length, is_split_into_words=False)
                    for phrase in phrases
                ]
                for phrases in sentences
            ]
            # One encoding per movie: the genre list is passed as
            # pre-split words. Uses max_length (previously a hard-coded 32
            # that ignored the argument).
            self.genres_categories = [
                tokenizer.encode_plus(genre_words, padding='max_length', return_attention_mask=True, return_tensors='pt', truncation=True, max_length=max_length, is_split_into_words=True)
                for genre_words in genres
            ]

        def __len__(self):
            """Return the number of movies."""
            return len(self.sentences)

        def __getitem__(self, idx):
            """Return ``[phrase_features, genre_encoding]`` for movie ``idx``.

            ``phrase_features`` is a dict holding the stacked ``input_ids``
            and ``attention_mask`` tensors of all phrases of the movie.
            """
            sentences = self.encoded_sentences[idx]
            genres = self.genres_categories[idx]
            input_ids = torch.tensor([d['input_ids'] for d in sentences])
            attention_masks = torch.tensor([d['attention_mask'] for d in sentences])
            return [{'input_ids': input_ids, 'attention_mask': attention_masks}, genres]

    @staticmethod
    def mean_pooling(model_output, attention_mask):
        """Average token embeddings over the positions enabled in the mask.

        Args:
            model_output: Sequence whose first element holds the token
                embeddings.
            attention_mask: Mask with one entry per token; it is broadcast
                over the embedding dimension.

        Returns:
            One pooled vector per row of ``attention_mask``.
        """
        token_embeddings = model_output[0]  # First element contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # clamp avoids a division by zero for rows whose mask is all zeros.
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def _embed(self, encoded):
        """Run the model on one encoding and mean-pool the result (no gradients)."""
        with torch.no_grad():
            model_output = self.model(**encoded)
        return self.mean_pooling(model_output, encoded['attention_mask'])

    def compute_sentence_embeddings(self):
        """Embed every movie of the dataset.

        Returns:
            ``(sentence_embedded_data, genres_embedded_data)``: two lists of
            tensors, one entry per movie.
        """
        sentence_embedded_data = []
        genres_embedded_data = []
        for i in range(len(self.data)):
            # Index once: every __getitem__ call rebuilds the tensors.
            phrase_features, genre_features = self.data[i]
            sentence_embedded_data.append(self._embed(phrase_features))  # Keep as tensors
            genres_embedded_data.append(self._embed(genre_features))  # Keep as tensors
        return sentence_embedded_data, genres_embedded_data

    @staticmethod
    def similarity_matrix(a, b, eps=1e-9):
        """Return the cosine-similarity matrix of the rows of ``a`` and ``b``.

        Both inputs are row-normalised (norms clamped to ``eps``) and then
        zero-padded to the same number of rows, so the result is square.
        Entry ``[i, j]`` is the cosine similarity of ``a[i]`` and ``b[j]``;
        entries that involve a padding row are zero.

        Returns:
            ``(sim_mt, combinations)`` where ``combinations`` is the number
            of real (unpadded) row pairs.
        """
        a_n, b_n = a.norm(dim=1)[:, None], b.norm(dim=1)[:, None]
        a_norm = a / torch.clamp(a_n, min=eps)
        b_norm = b / torch.clamp(b_n, min=eps)
        a_norm, b_norm, combinations = MovieSimilarityModel.tensor_pad(a_norm, b_norm)
        sim_mt = torch.mm(a_norm, b_norm.transpose(0, 1))
        return sim_mt, combinations

    @staticmethod
    def tensor_pad(a, b):
        """Zero-pad the shorter of two 2-D tensors to the other's row count.

        The operands are always returned in the order they were given
        (the previous implementation swapped them when ``b`` was shorter).

        Returns:
            ``(a, b, combinations)`` with ``combinations`` the product of
            the original row counts.
        """
        x_a, _ = a.shape
        x_b, _ = b.shape
        combinations = x_a * x_b

        # Padding spec is (left, right, top, bottom): append rows at the bottom.
        if x_a < x_b:
            a = torch.nn.functional.pad(a, (0, 0, 0, x_b - x_a))
        elif x_b < x_a:
            b = torch.nn.functional.pad(b, (0, 0, 0, x_a - x_b))
        return a, b, combinations

    @staticmethod
    def _pair_score(sentence_embed_x, genres_embed_x, sentence_embed_y, genres_embed_y):
        """Return the weighted themes/genres similarity of two movies as a float."""
        sim_mt, combinations = MovieSimilarityModel.similarity_matrix(sentence_embed_x, sentence_embed_y)
        # Padding rows contribute zeros, so dividing by the number of real
        # pairs yields the mean pairwise cosine similarity.
        thc_similarity = sim_mt.sum() / combinations
        genres_similarity = torch.nn.functional.cosine_similarity(genres_embed_x, genres_embed_y)
        final_similarity = (MovieSimilarityModel.THEMES_WEIGHT * thc_similarity) + (MovieSimilarityModel.GENRES_WEIGHT * genres_similarity)
        return final_similarity.item()

    def compute_similarity_matrix(self):
        """Score every movie against every movie.

        Reads ``sentence_embedded_data`` and ``genres_embedded_data``, which
        are set by ``load_tensor_list``.

        Returns:
            A list of rows (lists of floats), one row per movie.
        """
        movies = list(zip(self.sentence_embedded_data, self.genres_embedded_data))
        return [
            [
                self._pair_score(sentence_embed_x, genres_embed_x, sentence_embed_y, genres_embed_y)
                for sentence_embed_y, genres_embed_y in movies
            ]
            for sentence_embed_x, genres_embed_x in movies
        ]

    def encode_input_movie(self, input_themes, input_genres):
        """Embed a single query movie.

        Args:
            input_themes: List of phrase lists; only the first entry is
                encoded.
            input_genres: List of genre-word lists; only the first entry is
                encoded.

        Returns:
            ``(input_sentence_embeddings, input_genres_embeddings)``.
        """
        input_data = self.SentenceDataset(input_themes, input_genres, self.tokenizer, max_length=self.max_length)
        phrase_features, genre_features = input_data[0]
        return self._embed(phrase_features), self._embed(genre_features)

    def compare_input_vs_dataset(self, input_sentence_embeddings, input_genres_embeddings, top_k=5):
        """Rank the dataset movies by similarity to the query embeddings.

        Reads ``sentence_embedded_data`` and ``genres_embedded_data``, which
        are set by ``load_tensor_list``.

        Args:
            input_sentence_embeddings: Phrase embeddings of the query movie.
            input_genres_embeddings: Genre embedding of the query movie.
            top_k: Number of movies to return (default 5).

        Returns:
            A DataFrame with columns ``Movie`` and ``Similarity Score``,
            best match first.
        """
        similarity_scores = [
            self._pair_score(input_sentence_embeddings, input_genres_embeddings, dataset_sentence_embeddings, dataset_genres_embeddings)
            for dataset_sentence_embeddings, dataset_genres_embeddings in zip(self.sentence_embedded_data, self.genres_embedded_data)
        ]

        # Descending order, then keep the first top_k entries.
        top_indices = np.argsort(similarity_scores)[::-1][:max(int(top_k), 0)]
        top_scores = np.array(similarity_scores)[top_indices]
        top_movies = self.dataset['Movie'].iloc[top_indices].values

        return pd.DataFrame(list(zip(top_movies, top_scores)), columns=['Movie', 'Similarity Score'])

    def load_tensor_list(self):
        """Load the precomputed embeddings from the working directory.

        The tensors are mapped onto the model's device so they can be
        compared with embeddings produced by ``encode_input_movie``.
        """
        device = self.model.device
        self.sentence_embedded_data = torch.load('./tac_embeddings.pt', map_location=device)
        self.genres_embedded_data = torch.load('./genres_embeddings.pt', map_location=device)

    def save_embeddings_to_dataframe(self):
        """Compute all embeddings and bundle them with the movie titles.

        Despite the name, nothing is written and no DataFrame is built.

        Returns:
            A dict with keys ``Movie``, ``Sentence_Embeddings`` and
            ``Genres_Embeddings``.
        """
        sentence_embedded_data, genres_embedded_data = self.compute_sentence_embeddings()

        embeddings_dict = {
            'Movie': self.dataset['Movie'],
            'Sentence_Embeddings': sentence_embedded_data,
            'Genres_Embeddings': genres_embedded_data
        }

        return embeddings_dict
|
166 |
+
|
README.md
CHANGED
@@ -1,13 +1,13 @@
|
|
1 |
-
---
|
2 |
-
title: Similarity Model
|
3 |
-
emoji: 🦀
|
4 |
-
colorFrom: yellow
|
5 |
-
colorTo: green
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 4.41.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
license: mit
|
11 |
-
---
|
12 |
-
|
13 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
+
---
|
2 |
+
title: Similarity Model
|
3 |
+
emoji: 🦀
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: green
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 4.41.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: mit
|
11 |
+
---
|
12 |
+
|
13 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
Themes_And_Concepts.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:538e953c7030591110f38921f4a2252a5e61400a6daad84f1896159b2b9b1300
|
3 |
+
size 78347252
|
app.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import torch
|
3 |
+
import pandas as pd
|
4 |
+
from ModelSimilarity import MovieSimilarityModel
|
5 |
+
|
6 |
+
# MovieSimilarityModel is imported from ModelSimilarity; build it from the
# metadata CSV that sits next to this script.
csv_file_path = './Themes_And_Concepts.csv'
movie_similarity_model = MovieSimilarityModel(csv_file_path)

# Attach the precomputed dataset embeddings to the model instance.
movie_similarity_model.load_tensor_list()
|
12 |
+
|
13 |
+
# Define a function to wrap the comparison
|
14 |
+
def _rows_to_single_movie(rows):
    """Flatten list-component rows into the one-movie nested list the model expects."""
    cells = []
    for row in rows or []:
        # A row is normally a list of cells; tolerate a bare value as well.
        row_cells = row if isinstance(row, (list, tuple)) else [row]
        for cell in row_cells:
            if cell is None:
                continue
            text = str(cell).strip()
            if text:
                cells.append(text)
    # With nothing entered, fall back to a single empty string so the
    # encoder still receives one phrase.
    return [cells or [""]]


def get_top_5_similar_movies(input_themes, input_genres):
    """Return the movies most similar to the themes and genres entered.

    Args:
        input_themes: Rows of the themes list component, one theme per row.
        input_genres: Rows of the genres list component, one genre per row.

    Returns:
        The DataFrame produced by ``compare_input_vs_dataset``.
    """
    # encode_input_movie only encodes element 0 of each argument, so all
    # rows are merged into a single movie first; otherwise every row after
    # the first would be ignored.
    themes = _rows_to_single_movie(input_themes)
    genres = _rows_to_single_movie(input_genres)
    input_sentence_embeddings, input_genres_embeddings = movie_similarity_model.encode_input_movie(themes, genres)

    # Rank the dataset against the query embeddings.
    top_5_similar_movies = movie_similarity_model.compare_input_vs_dataset(input_sentence_embeddings, input_genres_embeddings)

    return top_5_similar_movies
|
22 |
+
|
23 |
+
# Input widgets: one list per query field, each with a single header.
input_themes = gr.List(
    headers=["Themes"],
    label="Input Movie Themes (array of strings)",
)
input_genres = gr.List(
    headers=["Genres"],
    label="Input Movie Genres (array of strings)",
)

# Output widget: the ranking is rendered as a pandas-backed table.
output = gr.Dataframe(type="pandas", label="Top 5 Similar Movies")

# Wire the inputs and the output to the comparison function.
interface = gr.Interface(
    fn=get_top_5_similar_movies,
    inputs=[input_themes, input_genres],
    outputs=output,
)

# Start serving the interface.
interface.launch()
|
genres_embeddings.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c70d517b5cd91db2a14677e2c2162d14ee13a5f1cdbf0c2e996f411b9752078b
|
3 |
+
size 2881850
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch == 2.4.0
|
2 |
+
transformers == 4.44.0
|
3 |
+
numpy == 2.0.1
|
4 |
+
pandas == 2.2.2
|
tac_embeddings.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:09d628a47bbc59f74def388861a49e27c9401bb67d9b66bfd1daaee7d9d006ad
|
3 |
+
size 22871834
|