import pandas as pd
import numpy as np
from zipfile import ZipFile
import tensorflow as tf
from tensorflow import keras
from pathlib import Path
import gradio as gr
from huggingface_hub import from_pretrained_keras
from collections import defaultdict
import math
import networkx as nx

model = from_pretrained_keras("keras-io/Node2Vec_MovieLens")
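# The pretrained model comes from the Keras node2vec example; its
# "item_embeddings" layer (queried further below) stores one embedding
# vector per vocabulary token.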

# Download the actual data from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
movielens_data_file_url = "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
movielens_zipped_file = keras.utils.get_file("ml-latest-small.zip", movielens_data_file_url, extract=False)
keras_datasets_path = Path(movielens_zipped_file).parents[0]
movielens_dir = keras_datasets_path / "ml-latest-small"

# Only extract the data the first time the script is run.
if not movielens_dir.exists():
    with ZipFile(movielens_zipped_file, "r") as zip_file:
        # Extract all files from the archive.
        print("Extracting all the files now...")
        zip_file.extractall(path=keras_datasets_path)
        print("Done!")

# Read the Movies csv
movies = pd.read_csv(f"{movielens_dir}/movies.csv")
# Create a `movieId` string.
movies["movieId"] = movies["movieId"].apply(lambda x: f"movie_{x}")

# Load ratings to a DataFrame.
ratings = pd.read_csv(f"{movielens_dir}/ratings.csv")
# Convert the ratings to floating point.
ratings["rating"] = ratings["rating"].astype(float)
# Create the `movieId` string.
ratings["movieId"] = ratings["movieId"].apply(lambda x: f"movie_{x}")

# Implement two utility functions for the movies DataFrame.
def get_movie_title_by_id(movieId):
    return list(movies[movies.movieId == movieId].title)[0]


def get_movie_id_by_title(title):
    return list(movies[movies.title == title].movieId)[0]

# Create weighted edges between movies.
min_rating = 5
pair_frequency = defaultdict(int)  # (movie_x, movie_y) -> number of users who rated both >= min_rating
item_frequency = defaultdict(int)  # movie -> number of ratings >= min_rating

# Filter instances where rating is greater than or equal to min_rating.
rated_movies = ratings[ratings.rating >= min_rating]
# Group instances by user.
movies_grouped_by_users = list(rated_movies.groupby("userId"))
for group in movies_grouped_by_users:
    # Get a list of movies rated by the user.
    current_movies = list(group[1]["movieId"])

    for i in range(len(current_movies)):
        item_frequency[current_movies[i]] += 1
        for j in range(i + 1, len(current_movies)):
            x = min(current_movies[i], current_movies[j])
            y = max(current_movies[i], current_movies[j])
            pair_frequency[(x, y)] += 1
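
# At this point, item_frequency counts each movie's qualifying ratings and
# pair_frequency counts co-occurrences; e.g. pair_frequency[("movie_1", "movie_50")] == 3
# would mean three users rated both movies at min_rating or above (hypothetical values).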

# Create the graph with the nodes and the edges

min_weight = 10
D = math.log(sum(item_frequency.values()))

# Create the movies undirected graph.
movies_graph = nx.Graph()
# Add weighted edges between movies.
# This automatically adds the movie nodes to the graph.
for pair in pair_frequency:
    x, y = pair
    xy_frequency = pair_frequency[pair]
    x_frequency = item_frequency[x]
    y_frequency = item_frequency[y]
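    # Pointwise mutual information: pmi(x, y) = log(p(x, y) / (p(x) * p(y))).
    # With p(x, y) ~ xy_frequency / N and p(x) ~ x_frequency / N, where
    # N = sum(item_frequency.values()), the logs expand to the line below,
    # using D = log(N) precomputed above.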
    pmi = math.log(xy_frequency) - math.log(x_frequency) - math.log(y_frequency) + D
    weight = pmi * xy_frequency
    # Only include edges with weight >= min_weight.
    if weight >= min_weight:
        movies_graph.add_edge(x, y, weight=weight)
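
# Optional sanity check of the graph size (actual counts depend on the dataset
# snapshot and the min_weight threshold):
# print(movies_graph.number_of_nodes(), movies_graph.number_of_edges())
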
# Create vocabulary and a mapping from tokens to integer indices
vocabulary = ["NA"] + list(movies_graph.nodes)
vocabulary_lookup = {token: idx for idx, token in enumerate(vocabulary)}
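# e.g. vocabulary_lookup["movie_1"] maps that token to its integer index
# (hypothetical node name); index 0 is reserved for the "NA" token.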

# Analyze the learnt embeddings.
movie_embeddings = model.get_layer("item_embeddings").get_weights()[0]
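# movie_embeddings is a (len(vocabulary), embedding_dim) matrix, assuming the
# vocabulary rebuilt above matches the one used at training time, so a token id
# from vocabulary_lookup indexes a movie's embedding row directly.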

# Find Related Movies
movie_titles = []

for uniq_mov_id in movies_graph.nodes:
    movie_title = get_movie_title_by_id(uniq_mov_id)
    movie_titles.append(movie_title)

def find_related_movies(movie_title, k):
    k = int(k)
    movieId = get_movie_id_by_title(movie_title)
    token_id = vocabulary_lookup[movieId]
    query_embeddings = np.array([movie_embeddings[token_id]])

    # Score every movie against the query embedding and take the top k.
    # Note: the query movie itself is usually its own closest match.
    similarities = tf.linalg.matmul(
        tf.math.l2_normalize(query_embeddings),
        tf.math.l2_normalize(movie_embeddings),
        transpose_b=True,
    )
    _, indices = tf.math.top_k(similarities, k)
    similar_tokens = indices.numpy().tolist()[0]

    related_movies = []
    for token in similar_tokens:
        similar_movieId = vocabulary[token]
        similar_title = get_movie_title_by_id(similar_movieId)
        related_movies.append(similar_title)

    return pd.DataFrame({"Related Movies": related_movies})
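
# Example usage (hypothetical title; any entry from `movie_titles` works):
# find_related_movies("Matrix, The (1999)", 5)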
        


demo = gr.Blocks()
with demo:
    gr.Markdown(
        """
        <div>
        <h1 style='text-align: center'>Find Related Movies</h1>
        <h2>Choose a movie from the dropdown and see the top-k related movies</h2>

        Note: The dropdown menu provides movie options from the MovieLens dataset.
        </div>
        """
    )

    with gr.Box():
        gr.Markdown(
            """
            ### Input
            #### Select a movie to find other related movies.
            """
        )

        inp1 = gr.Dropdown(movie_titles)
        gr.Markdown("<br>")
        gr.Markdown(
            """
            #### How many related movies do you want to find?
            """
        )
        inp2 = gr.Number()
        btn = gr.Button("Run")

    with gr.Box():
        gr.Markdown(
            """
            ### Output
            #### Top-k related movies.
            """
        )
        df1 = gr.DataFrame(headers=["Related Movies"], datatype=["str"], interactive=False)
    
    with gr.Row():
        gr.Markdown(
            """
            <h4>Credits</h4>
            Author: <a href="https://www.linkedin.com/in/khalid-salama-24403144/">Khalid Salama</a>.<br>
            Based on the Keras example <a href="https://keras.io/examples/graph/node2vec_movielens/">Graph representation learning with node2vec</a> by Khalid Salama.<br>
            Check out the model <a href="https://huggingface.co/keras-io/Node2Vec_MovieLens">here</a>.
            """
        )

    btn.click(fn=find_related_movies, inputs=[inp1, inp2], outputs=df1)

demo.launch(debug=True)