# -*- coding: utf-8 -*-
"""TfidfRecommender.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1pgFsGrn_MiauSCowY6fVgY1yq8vM3WRJ
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from transformers import BertTokenizer
import re
import unicodedata
import pandas as pd
import numpy as np
import nltk
from nltk.stem.porter import PorterStemmer
class TfidfRecommender:
    def __init__(self, df, id_col, text_col, tokenization_method):
        """Initialize model parameters.

        Args:
            df (pandas.DataFrame): Dataframe containing the items to recommend.
            id_col (str): Name of column containing item IDs.
            text_col (str): Name of column containing the text to vectorize.
            tokenization_method (str): ['none', 'nltk', 'bert', 'scibert'] option for tokenization method.
        """
        self.id_col = id_col
        self.text_col = text_col
        self.df = df

        if tokenization_method.lower() not in ["none", "nltk", "bert", "scibert"]:
            raise ValueError(
                'Tokenization method must be one of ["none" | "nltk" | "bert" | "scibert"]'
            )
        self.tokenization_method = tokenization_method.lower()

        # Initialize other variables used in this class
        self.tf = TfidfVectorizer()
        self.tfidf_matrix = dict()
        self.tokens = dict()
        self.stop_words = frozenset()
        self.recommendations = dict()
        self.top_k_recommendations = pd.DataFrame()
    def __clean_text(self, text, for_BERT=False, verbose=False):
        """Clean a single text entry; returns an empty string if the text cannot be cleaned."""
        try:
            # Remove new lines and tabs
            clean = text.replace("\n", " ")
            clean = clean.replace("\t", " ")
            clean = clean.replace("\r", " ")
            clean = clean.replace("Â\xa0", "")  # non-breaking space

            # Remove punctuation and special characters, keeping , . : and -
            # (to strip all punctuation instead, use r"([^\s\w]|_)+")
            clean = re.sub(r"([^,.:\s\w\-]|_)+", "", clean)

            # Skip further processing if the text will be used in BERT tokenization
            if for_BERT is False:
                # Lower case and remove the remaining punctuation
                clean = clean.lower()
                clean = re.sub(r"([^\s\w]|_)+", "", clean)
        except Exception:
            if verbose:
                print("Cannot clean non-existent text")
            clean = ""
        return clean
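    # Illustrative example of the cleaning behaviour (not from the original notebook):
    #   __clean_text("Toy Story,\t(1995)!")                 -> "toy story 1995"
    #   __clean_text("Toy Story,\t(1995)!", for_BERT=True)  -> "Toy Story, 1995"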
    def _clean_df(self):
        """Clean the text column of the dataframe in place."""
        self.df = self.df.replace(np.nan, "", regex=True)

        # Check whether cleaning is for BERT tokenization
        if self.tokenization_method in ["bert", "scibert"]:
            for_BERT = True
        else:
            for_BERT = False

        # Clean the text in the dataframe
        self.df[self.text_col] = self.df[self.text_col].map(
            lambda x: self.__clean_text(x, for_BERT)
        )
    def tokenize_text(self, ngram_range=(1, 3), min_df=0):
        """Tokenize the cleaned text.

        Args:
            ngram_range (tuple of int): The lower and upper boundary of the range of n-values for different n-grams to be extracted.
            min_df (int): When building the vocabulary, ignore terms that have a document frequency strictly lower than the given threshold.

        Returns:
            TfidfVectorizer, pandas.Series:
            - Scikit-learn TfidfVectorizer object defined in `.tokenize_text()`.
            - Each row contains tokens for the respective document, separated by spaces.
        """
        self._clean_df()
        vectors = self.df[self.text_col]
        if self.tokenization_method in ["bert", "scibert"]:
            # Vectorizer (BERT produces the tokens below, so only word-level TF-IDF here)
            tf = TfidfVectorizer(
                analyzer="word",
                ngram_range=ngram_range,
                min_df=min_df,
                stop_words="english",
            )

            if self.tokenization_method == "bert":
                bert_method = "bert-base-cased"
            elif self.tokenization_method == "scibert":
                bert_method = "allenai/scibert_scivocab_cased"

            # Load pre-trained BERT tokenizer (vocabulary)
            tokenizer = BertTokenizer.from_pretrained(bert_method)

            # Tokenization: join the WordPiece tokens back into a space-separated string
            vectors_tokenized = vectors.copy()
            for i in range(0, len(vectors)):
                vectors_tokenized.iloc[i] = " ".join(tokenizer.tokenize(vectors.iloc[i]))
elif self.tokenization_method == "nltk": | |
# NLTK Stemming | |
token_dict = {} # noqa: F841 | |
stemmer = PorterStemmer() | |
def stem_tokens(tokens, stemmer): | |
stemmed = [] | |
for item in tokens: | |
stemmed.append(stemmer.stem(item)) | |
return stemmed | |
def tokenize(text): | |
tokens = nltk.word_tokenize(text) | |
stems = stem_tokens(tokens, stemmer) | |
return stems | |
# The tokenization using a custom tokenizer is applied in the fit function | |
tf = TfidfVectorizer( | |
tokenizer=tokenize, | |
analyzer="word", | |
ngram_range=ngram_range, | |
min_df=min_df, | |
stop_words="english", | |
) | |
vectors_tokenized = vectors | |
elif self.tokenization_method == "none": | |
# No tokenization applied | |
tf = TfidfVectorizer( | |
analyzer="word", | |
ngram_range=ngram_range, | |
min_df=min_df, | |
stop_words="english", | |
) | |
vectors_tokenized = vectors | |
# Save to class variable | |
self.tf = tf | |
return tf, vectors_tokenized | |
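    # Illustrative call (not part of the original notebook): restrict to unigrams and
    # bigrams, dropping terms that appear in fewer than 2 documents:
    #   tf, vectors_tokenized = recommender.tokenize_text(ngram_range=(1, 2), min_df=2)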
    def fit(self, tf, vectors_tokenized):
        """Fit the TF-IDF vectorizer on the tokenized text and store the resulting matrix."""
        self.tfidf_matrix = tf.fit_transform(vectors_tokenized)
    def get_tokens(self):
        """Return the vocabulary learned by the fitted vectorizer."""
        try:
            self.tokens = self.tf.vocabulary_
        except Exception:
            self.tokens = "Run .tokenize_text() and .fit() first"
        return self.tokens
    def get_stop_words(self):
        """Return the stop words used by the fitted vectorizer."""
        try:
            self.stop_words = self.tf.get_stop_words()
        except Exception:
            self.stop_words = "Run .tokenize_text() and .fit() first"
        return self.stop_words
    def recommend_k_items(self, title, k):
        """Return the IDs of the k items most similar to the given title.

        Assumes the dataframe has a 'title' column used to look items up by name.
        """
        idx = self.df[self.df["title"] == title].index[0]
        cosine_sim = cosine_similarity(self.tfidf_matrix[int(idx)], self.tfidf_matrix)
        similarity_scores = list(enumerate(cosine_sim[0]))
        similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
        # Skip the first entry (the item itself) and keep the next k
        similarity_scores = similarity_scores[1 : k + 1]
        item_indices = [i[0] for i in similarity_scores]
        return self.df.iloc[item_indices][self.id_col]
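# Example usage below, as in the original notebook. It assumes the CSV on Google Drive
# contains 'id', 'title', and 'description' columns and that Drive is mounted in Colab.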
d = pd.read_csv('/content/drive/MyDrive/Rec/data/cleaned/descriptions.csv')
model = TfidfRecommender(d, 'id', 'description', 'bert')
tf, vec = model.tokenize_text()
model.fit(tf, vec)
model.recommend_k_items('Toy Story', 5)
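# Illustrative follow-up (not in the original notebook): inspect the fitted model.
print(model.recommend_k_items('Toy Story', 5).tolist())  # IDs of the 5 most similar items
print(len(model.get_tokens()))                           # size of the learned vocabulary
print(sorted(model.get_stop_words())[:10])               # sample of the English stop words used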