# Necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import gradio as gr
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from multiprocessing import Pool
import logging
# Set up logging
logging.basicConfig(level=logging.INFO)

# Download the NLTK data needed by word_tokenize and the lemmatizer
logging.info("Downloading NLTK data...")
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
logging.info("NLTK data downloaded successfully.")
# Initialize a WordNet lemmatizer
logging.info("Initializing WordNet lemmatizer...")
lemmatizer = WordNetLemmatizer()
logging.info("WordNet lemmatizer initialized successfully.")
# Preprocessing functions
def remove_digits(s):
    """Remove digits from a string."""
    return re.sub(r'\d', '', str(s))

def remove_ads(s):
    """Remove the word 'ADVERTISEMENT' from a string."""
    return re.sub(r'ADVERTISEMENT', '', str(s))

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Preprocess text by tokenizing, converting to lower case, removing punctuation and stopwords, and lemmatizing."""
    # Tokenize and convert to lower case
    words = word_tokenize(text.lower())
    # Keep alphabetic tokens only (drops punctuation, digits, and special characters)
    words = [word for word in words if word.isalpha()]
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    # Lemmatize the words
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)
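
# Illustrative example (not executed; output assumes the NLTK models above):
#   preprocess_text("2 cups of diced Tomatoes!")  ->  "cup diced tomato"
#   ('2' and '!' fail isalpha(), 'of' is a stopword, plurals are lemmatized)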
# Function to load data
def load_data(file_path):
    """Load a recipe JSON file and clean its ingredients column."""
    logging.info(f"Loading data from {file_path}...")
    data = pd.read_json(file_path).T
    data = data[['title', 'ingredients', 'instructions']].dropna(how='any')
    # 'ingredients' is a list per recipe; str() inside the helpers flattens it
    data['ingredients'] = data['ingredients'].apply(remove_digits)
    data['ingredients'] = data['ingredients'].apply(remove_ads)
    # Tokenizing and lemmatizing happen once, in the multiprocessing step below
    logging.info(f"Data loaded from {file_path} successfully.")
    return data
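
# The three files below are assumed to follow the recipes_raw layout
# (Eightportions dataset): a JSON object keyed by recipe id, where each value
# carries 'title', 'ingredients' (a list of strings), and 'instructions'.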
# Paths to the data files
file_paths = [
    'recipes_raw_nosource_ar.json',
    'recipes_raw_nosource_epi.json',
    'recipes_raw_nosource_fn.json'
]
# Load and concatenate data from all files
logging.info("Loading and concatenating data from all files...")
data = pd.concat([load_data(file_path) for file_path in file_paths])
data = data.reset_index(drop=True)
logging.info("Data loaded and concatenated successfully.")
# Multiprocessing pool to apply the preprocessing function to each ingredient.
# A module-level Pool assumes a fork-based start method, as on Linux; on
# spawn-based platforms this would need an `if __name__ == "__main__":` guard.
logging.info("Applying preprocessing function to each ingredient...")
with Pool() as pool:
    data['ingredients'] = pool.map(preprocess_text, data['ingredients'])
logging.info("Preprocessing function applied successfully.")
# Vectorize the ingredients text
logging.info("Vectorizing the ingredients text...")
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['ingredients'])
logging.info("Ingredients text vectorized successfully.")
# Function to recommend recipes
def recommend_recipes(input_ingredients, n=5):
    """Recommend the n recipes whose ingredients best match the input."""
    logging.info("Recommending recipes...")
    # Preprocess and vectorize the input ingredients
    logging.info("Preprocessing and vectorizing the input ingredients...")
    input_ingredients = preprocess_text(input_ingredients)
    input_vector = vectorizer.transform([input_ingredients])
    # Compute cosine similarity between the input and all recipes
    logging.info("Computing cosine similarity between input and all recipes...")
    cosine_similarities = cosine_similarity(input_vector, tfidf_matrix).flatten()
    # Get the indices of the n most similar recipes, best match first
    top_indices = cosine_similarities.argsort()[:-n-1:-1]
    # Assemble the full text of the most similar recipes
    recommended_recipes = []
    for i, index in enumerate(top_indices, start=1):
        recipe = data.iloc[index]
        # 'ingredients' holds the preprocessed tokens, since the raw text
        # was overwritten during preprocessing above
        recipe_dict = {
            "Title": recipe['title'],
            "Ingredients": recipe['ingredients'],
            "Instructions": recipe['instructions']
        }
        # Format the recipe with numbering (Markdown emphasis for the headings)
        recipe_md = (f"{i}. {recipe_dict['Title']}\n\n"
                     f"_Ingredients:_\n\n{recipe_dict['Ingredients']}\n\n"
                     f"_Instructions:_\n\n{recipe_dict['Instructions']}")
        recommended_recipes.append(recipe_md)
    logging.info("Recommendations assembled successfully.")
    # Join the recommended recipes, separated by blank lines
    return "\n\n\n".join(recommended_recipes)
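
# Example (console) usage, independent of the UI below:
#   print(recommend_recipes("chicken garlic lemon thyme", n=3))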
# Create and launch the Gradio interface
logging.info("Creating Gradio interface...")
iface = gr.Interface(fn=recommend_recipes,
                     inputs=gr.Textbox(lines=2, label="Enter Ingredients"),
                     # Markdown output so the _..._ emphasis above is rendered
                     outputs="markdown")
logging.info("Gradio interface created successfully.")
iface.launch()
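# On Hugging Face Spaces, launch() picks up the host and port automatically;
# for local testing, iface.launch(share=True) would expose a temporary public URL.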