# Necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import gradio as gr
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from multiprocessing import Pool
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)

# Download necessary NLTK data
logging.info("Downloading NLTK data...")
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
logging.info("NLTK data downloaded successfully.")

# Initialize a WordNet lemmatizer
logging.info("Initializing WordNet lemmatizer...")
lemmatizer = WordNetLemmatizer()
logging.info("WordNet lemmatizer initialized successfully.")


# Preprocessing functions
def remove_digits(s):
    """Remove digits from a string."""
    return re.sub(r'\d', '', str(s))


def remove_ads(s):
    """Remove the word 'ADVERTISEMENT' from a string."""
    return re.sub(r'ADVERTISEMENT', '', str(s))


stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Preprocess text by tokenizing, converting to lower case,
    removing punctuation and stopwords, and lemmatizing."""
    # Tokenize and convert to lower case
    words = word_tokenize(text.lower())
    # Keep alphabetic tokens only (drops punctuation and special characters)
    words = [word for word in words if word.isalpha()]
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    # Lemmatize the words
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)


# Function to load data
def load_data(file_path):
    """Load data from a JSON file and clean the ingredients column."""
    logging.info(f"Loading data from {file_path}...")
    data = pd.read_json(file_path).T
    data = data[['title', 'ingredients', 'instructions']].dropna(how='any')
    data['ingredients'] = data['ingredients'].apply(remove_digits)
    data['ingredients'] = data['ingredients'].apply(remove_ads)
    # Note: full text preprocessing is applied later, in parallel, over the
    # concatenated dataset, so it is not duplicated here.
    logging.info(f"Data loaded from {file_path} successfully.")
    return data


# Paths to data files
file_paths = [
    'recipes_raw_nosource_ar.json',
    'recipes_raw_nosource_epi.json',
    'recipes_raw_nosource_fn.json'
]

# Load and concatenate data from all files
logging.info("Loading and concatenating data from all files...")
data = pd.concat([load_data(file_path) for file_path in file_paths])
data.index = range(len(data))
logging.info("Data loaded and concatenated successfully.")

# Multiprocessing pool to apply the preprocessing function to each ingredient
logging.info("Applying preprocessing function to each ingredient...")
with Pool() as pool:
    data['ingredients'] = pool.map(preprocess_text, data['ingredients'])
logging.info("Preprocessing function applied successfully.")

# Vectorize the ingredients text
logging.info("Vectorizing the ingredients text...")
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['ingredients'])
logging.info("Ingredients text vectorized successfully.")


# Function to recommend recipes
def recommend_recipes(input_ingredients, n=5):
    """Recommend the n recipes whose ingredients best match the input."""
    logging.info("Recommending recipes...")

    # Preprocess and vectorize the input ingredients
    logging.info("Preprocessing and vectorizing the input ingredients...")
    input_ingredients = preprocess_text(input_ingredients)
    input_vector = vectorizer.transform([input_ingredients])
    logging.info("Input ingredients preprocessed and vectorized successfully.")

    # Compute cosine similarity between the input and all recipes
    logging.info("Computing cosine similarity between input and all recipes...")
    cosine_similarities = cosine_similarity(input_vector, tfidf_matrix).flatten()
    logging.info("Cosine similarity computed successfully.")

    # Get indices of the n recipes with highest similarity
    logging.info("Getting indices of recipes with highest similarity...")
    top_indices = cosine_similarities.argsort()[:-n - 1:-1]
    logging.info("Indices of recipes with highest similarity obtained successfully.")

    # Build the full text of the most similar recipes
    logging.info("Returning full recipes of most similar recipes...")
    recommended_recipes = []
    for i, index in enumerate(top_indices, start=1):
        recipe = data.iloc[index]
        # Format the recipe with numbering
        recipe_md = (
            f"{i}. {recipe['title']}\n\n"
            f"_Ingredients:_\n\n{recipe['ingredients']}\n\n"
            f"_Instructions:_\n\n{recipe['instructions']}"
        )
        recommended_recipes.append(recipe_md)
    logging.info("Full recipes of most similar recipes returned successfully.")

    # Join the recommended recipes, separated by blank lines
    return "\n\n\n".join(recommended_recipes)


# Create and launch the Gradio interface
logging.info("Creating Gradio interface...")
iface = gr.Interface(
    fn=recommend_recipes,
    inputs=gr.Textbox(lines=2, label="Enter Ingredients"),
    outputs="textbox",
)
logging.info("Gradio interface created successfully.")

iface.launch()
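
# A minimal sketch of calling the recommender directly, without the Gradio UI.
# It assumes the three data files above are present and the script has run
# through the vectorization step. Since iface.launch() blocks, run this in an
# interactive session (or before the launch call). The ingredient string and
# n value here are only illustrative.
#
#     print(recommend_recipes("chicken garlic lemon butter", n=3))
#
# This prints the three recipes whose TF-IDF ingredient vectors have the
# highest cosine similarity to the query.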