# Necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import gradio as gr
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from multiprocessing import Pool
import logging
# Set up logging
logging.basicConfig(level=logging.INFO)

# Download the NLTK data needed by word_tokenize and the lemmatizer
logging.info("Downloading NLTK data...")
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
logging.info("NLTK data downloaded successfully.")
# Initialize a WordNet lemmatizer
logging.info("Initializing WordNet lemmatizer...")
lemmatizer = WordNetLemmatizer()
logging.info("WordNet lemmatizer initialized successfully.")
# Preprocessing functions
def remove_digits(s):
    """Remove digits from a string."""
    return re.sub(r'\d', '', str(s))

def remove_ads(s):
    """Remove the word 'ADVERTISEMENT' from a string."""
    return re.sub(r'ADVERTISEMENT', '', str(s))

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Preprocess text by tokenizing, converting to lower case, removing punctuation and stopwords, and lemmatizing."""
    # Tokenize and convert to lower case
    words = word_tokenize(text.lower())
    # Keep alphabetic tokens only (drops punctuation, digits, and special characters)
    words = [word for word in words if word.isalpha()]
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    # Lemmatize the words
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)
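
# Illustrative example (not executed; output assumes the NLTK models above):
#   preprocess_text("2 cups of diced Tomatoes!")  ->  "cup diced tomato"
#   ('2' and '!' fail isalpha(), 'of' is a stopword, plurals are lemmatized)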
# Function to load data
def load_data(file_path):
    """Load a recipe JSON file and clean its ingredients column."""
    logging.info(f"Loading data from {file_path}...")
    data = pd.read_json(file_path).T
    data = data[['title', 'ingredients', 'instructions']].dropna(how='any')
    # 'ingredients' is a list per recipe; str() inside the helpers flattens it
    data['ingredients'] = data['ingredients'].apply(remove_digits)
    data['ingredients'] = data['ingredients'].apply(remove_ads)
    # Tokenizing and lemmatizing happen once, in the multiprocessing step below
    logging.info(f"Data loaded from {file_path} successfully.")
    return data
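
# The three files below are assumed to follow the recipes_raw layout
# (Eightportions dataset): a JSON object keyed by recipe id, where each value
# carries 'title', 'ingredients' (a list of strings), and 'instructions'.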
# Paths to the data files
file_paths = [
    'recipes_raw_nosource_ar.json',
    'recipes_raw_nosource_epi.json',
    'recipes_raw_nosource_fn.json'
]
# Load and concatenate data from all files
logging.info("Loading and concatenating data from all files...")
data = pd.concat([load_data(file_path) for file_path in file_paths])
data = data.reset_index(drop=True)
logging.info("Data loaded and concatenated successfully.")
# Multiprocessing pool to apply the preprocessing function to each ingredient.
# A module-level Pool assumes a fork-based start method, as on Linux; on
# spawn-based platforms this would need an `if __name__ == "__main__":` guard.
logging.info("Applying preprocessing function to each ingredient...")
with Pool() as pool:
    data['ingredients'] = pool.map(preprocess_text, data['ingredients'])
logging.info("Preprocessing function applied successfully.")
# Vectorize the ingredients text
logging.info("Vectorizing the ingredients text...")
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['ingredients'])
logging.info("Ingredients text vectorized successfully.")
# Function to recommend recipes
def recommend_recipes(input_ingredients, n=5):
    """Recommend the n recipes whose ingredients best match the input."""
    logging.info("Recommending recipes...")
    # Preprocess and vectorize the input ingredients
    logging.info("Preprocessing and vectorizing the input ingredients...")
    input_ingredients = preprocess_text(input_ingredients)
    input_vector = vectorizer.transform([input_ingredients])
    # Compute cosine similarity between the input and all recipes
    logging.info("Computing cosine similarity between input and all recipes...")
    cosine_similarities = cosine_similarity(input_vector, tfidf_matrix).flatten()
    # Get the indices of the n most similar recipes, best match first
    top_indices = cosine_similarities.argsort()[:-n-1:-1]
    # Assemble the full text of the most similar recipes
    recommended_recipes = []
    for i, index in enumerate(top_indices, start=1):
        recipe = data.iloc[index]
        # 'ingredients' holds the preprocessed tokens, since the raw text
        # was overwritten during preprocessing above
        recipe_dict = {
            "Title": recipe['title'],
            "Ingredients": recipe['ingredients'],
            "Instructions": recipe['instructions']
        }
        # Format the recipe with numbering (Markdown emphasis for the headings)
        recipe_md = (f"{i}. {recipe_dict['Title']}\n\n"
                     f"_Ingredients:_\n\n{recipe_dict['Ingredients']}\n\n"
                     f"_Instructions:_\n\n{recipe_dict['Instructions']}")
        recommended_recipes.append(recipe_md)
    logging.info("Recommendations assembled successfully.")
    # Join the recommended recipes, separated by blank lines
    return "\n\n\n".join(recommended_recipes)
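
# Example (console) usage, independent of the UI below:
#   print(recommend_recipes("chicken garlic lemon thyme", n=3))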
# Create and launch the Gradio interface
logging.info("Creating Gradio interface...")
iface = gr.Interface(fn=recommend_recipes,
                     inputs=gr.Textbox(lines=2, label="Enter Ingredients"),
                     # Markdown output so the _..._ emphasis above is rendered
                     outputs="markdown")
logging.info("Gradio interface created successfully.")
iface.launch()
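# On Hugging Face Spaces, launch() picks up the host and port automatically;
# for local testing, iface.launch(share=True) would expose a temporary public URL.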