# Necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import gradio as gr
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from multiprocessing import Pool

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')  # required by word_tokenize on newer NLTK releases
nltk.download('wordnet')
nltk.download('stopwords')

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()
# Preprocessing functions
def remove_digits(s):
    """Remove digits from a string."""
    return re.sub(r'\d', '', str(s))


def remove_ads(s):
    """Remove the word 'ADVERTISEMENT' from a string."""
    return re.sub(r'ADVERTISEMENT', '', str(s))
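
# Illustrative behavior of the cleaning helpers (hypothetical inputs):
#   remove_digits("2 cups flour")      -> " cups flour"
#   remove_ads("ADVERTISEMENT butter") -> " butter"
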
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Tokenize, lowercase, strip punctuation and stopwords, and lemmatize text."""
    # Tokenize and convert to lower case
    words = word_tokenize(text.lower())
    # Keep only purely alphabetic tokens (drops punctuation, digits, special characters)
    words = [word for word in words if word.isalpha()]
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    # Lemmatize the remaining words
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)
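
# Illustrative result (hypothetical input):
#   preprocess_text("2 cups of chopped onions!") -> "cup chopped onion"
# ("2" and "!" are dropped as non-alphabetic, "of" as a stopword,
#  and "cups"/"onions" are lemmatized to "cup"/"onion")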

# Function to load data
def load_data(file_path):
    """Load recipes from a JSON file and clean the ingredient text."""
    data = pd.read_json(file_path).T
    data = data[['title', 'ingredients', 'instructions']].dropna(how='any')
    # The raw dataset may store ingredients as a list of strings; flatten to one string
    data['ingredients'] = data['ingredients'].apply(
        lambda x: ' '.join(x) if isinstance(x, list) else str(x))
    data['ingredients'] = data['ingredients'].apply(remove_digits)
    data['ingredients'] = data['ingredients'].apply(remove_ads)
    # Full preprocessing (tokenize/lemmatize) happens once, in parallel, after concatenation
    return data
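
# e.g. load_data('recipes_raw_nosource_ar.json') returns a DataFrame
# with 'title', 'ingredients', and 'instructions' columns.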

# Paths to the raw recipe data files
file_paths = [
    'recipes_raw_nosource_ar.json',
    'recipes_raw_nosource_epi.json',
    'recipes_raw_nosource_fn.json'
]

# Load and concatenate data from all files, resetting the index
data = pd.concat([load_data(file_path) for file_path in file_paths], ignore_index=True)

# Preprocess all ingredient text in parallel across CPU cores
# (note: creating a Pool at import time assumes a fork-based platform such as Linux)
with Pool() as pool:
    data['ingredients'] = pool.map(preprocess_text, data['ingredients'])

# Vectorize the ingredient text with TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['ingredients'])
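
# Quick sanity check one could run here (commented out; illustrative only):
#   assert tfidf_matrix.shape == (len(data), len(vectorizer.vocabulary_))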

def recommend_recipes(input_ingredients, n=5):
    """Recommend the n recipes whose ingredients best match the input."""
    # Preprocess and vectorize the input ingredients
    input_ingredients = preprocess_text(input_ingredients)
    input_vector = vectorizer.transform([input_ingredients])
    # Compute cosine similarity between the input and every recipe
    cosine_similarities = cosine_similarity(input_vector, tfidf_matrix).flatten()
    # Indices of the n most similar recipes, in descending order of similarity
    top_indices = cosine_similarities.argsort()[::-1][:n]
    # Format each match as a numbered Markdown entry
    # (ingredients are shown in their preprocessed form, as stored in `data`)
    recommended_recipes = []
    for i, index in enumerate(top_indices, start=1):
        recipe = data.iloc[index]
        recipe_md = (
            f"{i}. {recipe['title']}\n\n"
            f"_Ingredients:_\n\n{recipe['ingredients']}\n\n"
            f"_Instructions:_\n\n{recipe['instructions']}"
        )
        recommended_recipes.append(recipe_md)
    # Separate the recommendations with blank lines
    return "\n\n\n".join(recommended_recipes)
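
# Illustrative usage (hypothetical input; not run at import time):
#   recommend_recipes("chicken garlic lemon butter", n=3)
# returns a Markdown string describing the three closest recipes.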

# Create and launch the Gradio interface
iface = gr.Interface(fn=recommend_recipes, inputs="text", outputs="text")
iface.launch()