Spaces:
Build error
Build error
File size: 7,338 Bytes
1158955 b2ac188 1158955 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 |
# -*- coding: utf-8 -*-
"""HS_Text_REC_Games_Gradio_Blocks.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/19yJ8RC70IDljwSmPlqtOzWz192gwLAHF
"""
# Dependency note: scikit-learn must be installed before this script runs
# (e.g. `pip install scikit-learn`, or list it in requirements.txt).
# Shell commands are not valid Python statements, so this stays a comment.
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
# Load the review dataset, skipping malformed CSV rows with a warning.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines="warn"` is its documented replacement (skip + warn).
try:
    df = pd.read_csv("Metacritic_Reviews_Only.csv", on_bad_lines="warn", encoding='utf-8')
except TypeError:
    # pandas < 1.3 does not know `on_bad_lines`; fall back to the old spelling.
    df = pd.read_csv("Metacritic_Reviews_Only.csv", error_bad_lines=False, encoding='utf-8')
# Remove title from review
def remove_title(row):
    """Return the row's review text with every occurrence of its game title removed.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with 'Game Title'
            and 'Reviews' entries.

    Returns:
        The review text without the title. If either the review or the title
        is not a string (for example a missing value), the review is returned
        unchanged so that a later ``dropna`` can still discard the row instead
        of this function raising.
    """
    game_title = row['Game Title']
    body_text = row['Reviews']
    # Missing values arrive as float NaN / None, which have no usable
    # ``str.replace``; pass them through untouched.
    if not isinstance(body_text, str) or not isinstance(game_title, str):
        return body_text
    return body_text.replace(game_title, "")
# Drop the redundant column left over from the CSV export.
df = df.drop(['Unnamed: 0'], axis=1)
# Drop rows with null values BEFORE stripping titles, so the title removal
# never sees a missing review, and renumber the rows so positional lookups
# (enumerate-based) agree with the index labels afterwards.
df = df.dropna().reset_index(drop=True)
# Remove each game's own title from its review text.
df['Reviews'] = df.apply(remove_title, axis=1)
# Build the TF-IDF vectorizer: a term must appear in at least 2 documents,
# and terms appearing in more than 70% of documents are excluded.
vectorizer = TfidfVectorizer(min_df=2, max_df=0.7)
# Fit and transform the review text.
vectorized_data = vectorizer.fit_transform(df['Reviews'])
# `get_feature_names()` was removed in scikit-learn 1.2; prefer the
# replacement and fall back only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Create a DataFrame from the TF-IDF array (one row per review, one column per term).
tfidf_df = pd.DataFrame(vectorized_data.toarray(), columns=feature_names)
# Assign the game titles to the index.
tfidf_df.index = df['Game Title']
# Cosine similarity between every pair of games.
cosine_similarity_array = cosine_similarity(tfidf_df)
# Wrap the similarity matrix in a DataFrame labelled by game title on both axes.
cosine_similarity_df = pd.DataFrame(cosine_similarity_array, index=tfidf_df.index, columns=tfidf_df.index)
# Sanity check: print the similarity scores for one known title, highest first.
# The lookup is guarded so a dataset without this title does not abort the
# whole script with a KeyError before the app is even built.
sample_title = 'Batman: Arkham City'
if sample_title in cosine_similarity_df.index:
    # Find the values for the sample game.
    cosine_similarity_series = cosine_similarity_df.loc[sample_title]
    # Sort these values highest to lowest.
    ordered_similarities = cosine_similarity_series.sort_values(ascending=False)
    # Print the results.
    print(ordered_similarities)
else:
    print('Sample title {!r} not found in the dataset; skipping similarity preview.'.format(sample_title))
# Scoring helper used to find the closest title
def matching_score(a, b):
    """Return the ``fuzz.ratio`` similarity score between ``a`` and ``b``.

    The score is edit-distance based; when the two strings are exactly the
    same it is 100.
    """
    similarity = fuzz.ratio(a, b)
    return similarity
# Convert a row index label to its game title
def get_title_from_index(index):
    """Return the 'Game Title' of the first ``df`` row whose index label equals ``index``.

    Raises:
        IndexError: if no row carries that index label.
    """
    matching_titles = df.loc[df.index == index, 'Game Title']
    return matching_titles.values[0]
# A function to return the most similar title to the words a user types.
# Without this, the recommender only works when a user enters the exact title which the data has.
def find_closest_title(title):
    """Return the dataset title closest to ``title`` and its match score.

    Every entry of ``df['Game Title']`` is scored against ``title`` with
    ``matching_score``; the first entry with the highest score wins.

    Args:
        title: Text typed by the user.

    Returns:
        Tuple ``(closest_title, distance_score)``, e.g. ``('Bejeweled Twist', 100)``.

    Raises:
        IndexError: if the dataset contains no titles.
    """
    titles = df['Game Title']
    # matching_score(a, b): a is the current row's title, b is the title we're trying to match.
    scores = titles.apply(matching_score, b=title).tolist()
    if not scores:
        raise IndexError("no game titles available to match against")
    # Work with row POSITIONS throughout: index labels may have gaps (rows
    # dropped earlier), so a label-based lookup of an enumerate() position
    # could return the wrong title. max() keeps the first of tied scores.
    best_position = max(range(len(scores)), key=scores.__getitem__)
    closest_title = titles.iloc[best_position]
    distance_score = scores[best_position]
    return closest_title, distance_score
def find_closest_titles(title, top_n=5):
    """Return the dataset titles closest to ``title``, best match first.

    Args:
        title: Text typed by the user.
        top_n: Maximum number of matches to return (default 5). Fewer are
            returned when the dataset holds fewer titles.

    Returns:
        Tuple ``(closest_titles, distance_scores)`` of two equally long lists,
        e.g. ``(['Bejeweled Twist', ...], [100, ...])``.
    """
    titles = df['Game Title']
    # Score every title against the user's text: one score per row position.
    scores = titles.apply(matching_score, b=title).tolist()
    # Sort (position, score) pairs by score, highest first; slicing instead of
    # indexing a fixed range avoids an IndexError on small datasets.
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)[:max(top_n, 0)]
    # Look titles up by POSITION: index labels may have gaps after rows were dropped.
    closest_titles = [titles.iloc[position] for position, _ in ranked]
    distance_scores = [score for _, score in ranked]
    return closest_titles, distance_scores
def recommend_games_v1(game1, game2, game3, max_results):
    """Recommend games whose review text is most similar to the games a user enjoyed.

    The TF-IDF rows of the selected games are averaged into a user profile,
    which is compared (cosine similarity) with every other game in ``tfidf_df``.

    Args:
        game1, game2, game3: Titles the user enjoyed. Selections that are
            blank or not present in ``tfidf_df``'s index are ignored.
        max_results: Maximum number of recommendations to return.

    Returns:
        DataFrame indexed by 'Rank' (1 = best) with columns 'Game Title' and
        '% Match' (similarity formatted as a percentage string).

    Raises:
        KeyError: if none of the three titles is present in the dataset.
    """
    print('Recommended because you played {}, {} and {}:\n'.format(game1, game2, game3))
    list_of_games_enjoyed = [game1, game2, game3]
    # Keep only titles the dataset knows: an unselected dropdown (None) or a
    # raw, unmatched string would otherwise abort with an opaque KeyError.
    known_games = [game for game in list_of_games_enjoyed if game in tfidf_df.index]
    if not known_games:
        raise KeyError('None of the selected games were found in the dataset: {}'.format(list_of_games_enjoyed))
    # User profile: mean TF-IDF vector of the games enjoyed.
    user_prof = tfidf_df.loc[known_games].mean()
    # Never recommend a game the user already listed.
    tfidf_subset_df = tfidf_df.loc[~tfidf_df.index.isin(known_games)]
    similarity_array = cosine_similarity(user_prof.values.reshape(1, -1), tfidf_subset_df)
    similarity_df = pd.DataFrame(similarity_array.T, index=tfidf_subset_df.index, columns=["similarity_score"])
    # Sort from high to low similarity and keep only the requested number of
    # rows. int() because UI sliders may deliver floats; clamp so a negative
    # value yields no rows rather than "all but the last n".
    limit = max(int(max_results), 0)
    top_matches = similarity_df.sort_values(by="similarity_score", ascending=False).head(limit)
    rank_range = list(range(1, len(top_matches) + 1))
    name_list = list(top_matches.index)
    # Format each score as a percentage string.
    score_list = [str(round(score * 100, 2)) + "% " for score in top_matches["similarity_score"].to_numpy()]
    # Turn lists into a dictionary
    data = {'Rank': rank_range, 'Game Title': name_list, '% Match': score_list}
    rec_table = pd.DataFrame.from_dict(data)  # Convert dictionary into dataframe
    # NOTE(review): Rank lives in the index; confirm the UI table actually displays it.
    rec_table.set_index('Rank', inplace=True)  # Make Rank column the index
    return rec_table
# Gradio UI: three title inputs, each with a "closest matches" dropdown, a
# result-count slider, and a button that generates the recommendation table.
demo = gr.Blocks()
with demo:
    gr.Markdown(
        "\n"
        "# Game Recommendations\n"
        "Input 3 games you enjoyed playing and use the dropdown to confirm your selections. "
        "Hopefully they are registered in the database. "
        "Once all 3 have been chosen, please generate your recommendations.\n"
    )
    options = ['Dragonball', 'Batman', 'Tekken']

    def Dropdown_list(x):
        """Return a dropdown update offering ``options`` plus edition variants of ``x``.

        Not wired to any event in this block.
        """
        new_options = [*options, x + " Remastered", x + ": The Remake", x + ": Game of the Year Edition", x + " Steelbook Edition"]
        # gr.update(...) replaces the per-component `gr.Dropdown.update`,
        # which no longer exists in Gradio 4.
        return gr.update(choices=new_options)

    with gr.Column(visible=True):
        first_entry = gr.Textbox(label="Game Title 1")
        first_dropdown = gr.Dropdown(choices=[], label="Closest Matches")
        update_first = gr.Button("Match Closest Title 1")
    with gr.Column(visible=True):
        second_entry = gr.Textbox(label="Game Title 2")
        second_dropdown = gr.Dropdown(choices=[], label="Closest Matches")
        update_second = gr.Button("Match Closest Title 2")
    with gr.Column(visible=True):
        third_entry = gr.Textbox(label="Game Title 3")
        third_dropdown = gr.Dropdown(choices=[], label="Closest Matches")
        update_third = gr.Button("Match Closest Title 3")
    with gr.Row():
        slider = gr.Slider(1, 20, step=1)
    with gr.Row():
        generate = gr.Button("Generate")
    results = gr.Dataframe(label="Top Results")

    def filter_matches(entry):
        """Return a dropdown update listing the dataset titles closest to ``entry``.

        The best match is pre-selected so the dropdown always holds a title
        that exists in the dataset; blank input clears the dropdown.
        """
        if not entry or not str(entry).strip():
            return gr.update(choices=[], value=None)
        top_matches = list(find_closest_titles(entry)[0])
        return gr.update(choices=top_matches, value=top_matches[0] if top_matches else None)

    def new_match(text):
        """Clear a dropdown when its textbox changes.

        Matches computed for earlier text are stale once the text is edited.
        No fuzzy matching happens here: scanning the whole dataset on every
        keystroke is wasteful, and raw text is not a valid dropdown choice.
        """
        return gr.update(choices=[], value=None)

    # Identical wiring for the three title inputs. The match buttons read the
    # TEXTBOX (what the user typed) and write the dropdown.
    for entry_box, matches_dropdown, match_button in (
        (first_entry, first_dropdown, update_first),
        (second_entry, second_dropdown, update_second),
        (third_entry, third_dropdown, update_third),
    ):
        entry_box.change(new_match, inputs=entry_box, outputs=matches_dropdown)
        match_button.click(filter_matches, inputs=entry_box, outputs=matches_dropdown)

    generate.click(recommend_games_v1, inputs=[first_dropdown, second_dropdown, third_dropdown, slider], outputs=results)
# Start the Gradio app.
demo.launch()