# Necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import gradio as gr
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from multiprocessing import Pool
import logging
# Set up logging
logging.basicConfig(level=logging.INFO)
# Download necessary NLTK data
logging.info("Downloading NLTK data...")
nltk.download('punkt')
nltk.download('punkt_tab')  # tokenizer tables; needed by word_tokenize on NLTK >= 3.9
nltk.download('wordnet')
nltk.download('stopwords')
logging.info("NLTK data downloaded successfully.")
# Initialize a WordNet lemmatizer
logging.info("Initializing WordNet lemmatizer...")
lemmatizer = WordNetLemmatizer()
logging.info("WordNet lemmatizer initialized successfully.")
# Preprocessing functions
def remove_digits(s):
    """Remove digits from a string."""
    return re.sub(r'\d', '', str(s))

def remove_ads(s):
    """Remove the word 'ADVERTISEMENT' from a string."""
    return re.sub(r'ADVERTISEMENT', '', str(s))
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    """Tokenize, lower-case, strip punctuation and stopwords, and lemmatize text."""
    # Tokenize and convert to lower case
    words = word_tokenize(text.lower())
    # Keep alphabetic tokens only (drops punctuation, digits, and special characters)
    words = [word for word in words if word.isalpha()]
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    # Lemmatize the remaining words
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)
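# Illustrative example (assuming the default WordNet noun lemmatization):
#   preprocess_text("2 cups of chopped Tomatoes")  ->  "cup chopped tomato"
# "2" is dropped by the isalpha() filter, "of" is a stopword, and
# "cups"/"Tomatoes" are lower-cased and lemmatized to their singular forms.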
# Function to load data
def load_data(file_path):
    """Load recipes from a JSON file and apply basic string cleanup."""
    logging.info(f"Loading data from {file_path}...")
    # The JSON is keyed by recipe ID, so transpose to get one row per recipe
    data = pd.read_json(file_path).T
    data = data[['title', 'ingredients', 'instructions']].dropna(how='any')
    data['ingredients'] = data['ingredients'].apply(remove_digits)
    data['ingredients'] = data['ingredients'].apply(remove_ads)
    # Full text preprocessing happens once, in parallel, after all files are
    # concatenated (see the multiprocessing step below), so it is not repeated here.
    logging.info(f"Data loaded from {file_path} successfully.")
    return data
# Paths to the raw recipe data files
file_paths = [
    'recipes_raw_nosource_ar.json',
    'recipes_raw_nosource_epi.json',
    'recipes_raw_nosource_fn.json'
]
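# These file names appear to come from the public "recipes_raw" dataset, where
# the _ar/_epi/_fn suffixes correspond to AllRecipes, Epicurious, and Food
# Network scrapes. Each file maps recipe IDs to title/ingredients/instructions
# records, which is why load_data() transposes the frame.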
# Load and concatenate data from all files
logging.info("Loading and concatenating data from all files...")
data = pd.concat([load_data(file_path) for file_path in file_paths])
data.index = range(len(data))
logging.info("Data loaded and concatenated successfully.")
# Multiprocessing pool to apply the preprocessing function to each ingredient
logging.info("Applying preprocessing function to each ingredient...")
with Pool() as pool:
    data['ingredients'] = pool.map(preprocess_text, data['ingredients'])
logging.info("Preprocessing function applied successfully.")
# Vectorize the ingredients text
logging.info("Vectorizing the ingredients text...")
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['ingredients'])
logging.info("Ingredients text vectorized successfully.")
# Function to recommend recipes
def recommend_recipes(input_ingredients, n=5):
    """Recommend the n recipes whose ingredients are most similar to the input."""
    logging.info("Recommending recipes...")
    # Preprocess and vectorize the input ingredients
    input_ingredients = preprocess_text(input_ingredients)
    input_vector = vectorizer.transform([input_ingredients])
    # Compute cosine similarity between the input and every recipe
    cosine_similarities = cosine_similarity(input_vector, tfidf_matrix).flatten()
    # argsort is ascending, so take the last n indices in reverse order
    # (most similar recipe first)
    top_indices = cosine_similarities.argsort()[:-n-1:-1]
    # Build a numbered Markdown block for each recommended recipe.
    # NB: data['ingredients'] holds the preprocessed text at this point, so the
    # listing shows cleaned tokens rather than the original ingredient lines.
    recommended_recipes = []
    for i, index in enumerate(top_indices, start=1):
        recipe = data.iloc[index]
        recipe_md = (
            f"{i}. {recipe['title']}\n\n"
            f"_Ingredients:_\n\n{recipe['ingredients']}\n\n"
            f"_Instructions:_\n\n{recipe['instructions']}"
        )
        recommended_recipes.append(recipe_md)
    logging.info("Recommendations generated successfully.")
    # Join the recommended recipes, separated by blank lines
    return "\n\n\n".join(recommended_recipes)
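# Example usage (hypothetical input): recommend_recipes("chicken garlic lemon", n=3)
# returns one Markdown string containing the three most similar recipes, e.g.
# "1. <recipe title>\n\n_Ingredients:_ ..." for each of the three matches.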
# Create and return Gradio interface
logging.info("Creating Gradio interface...")
iface = gr.Interface(
    fn=recommend_recipes,
    inputs=gr.Textbox(lines=2, label="Enter Ingredients"),
    # Markdown output so the numbering and _Ingredients_/_Instructions_
    # emphasis produced by recommend_recipes render instead of showing
    # literal underscores in a plain textbox
    outputs=gr.Markdown(label="Recommended Recipes"),
)
logging.info("Gradio interface created successfully.")
iface.launch()