# Necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import gradio as gr
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from multiprocessing import Pool

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')  # required by word_tokenize on newer NLTK releases
nltk.download('wordnet')
nltk.download('stopwords')

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()
# Preprocessing functions
def remove_digits(s):
    """Remove digits from a string."""
    return re.sub(r'\d', '', str(s))


def remove_ads(s):
    """Remove the word 'ADVERTISEMENT' from a string."""
    return re.sub(r'ADVERTISEMENT', '', str(s))
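
# Illustrative behavior of the cleaning helpers (hypothetical inputs):
#   remove_digits("2 cups flour")      -> " cups flour"
#   remove_ads("ADVERTISEMENT butter") -> " butter"
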
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Tokenize, lowercase, strip punctuation and stopwords, and lemmatize text."""
    # Tokenize and convert to lower case
    words = word_tokenize(text.lower())
    # Keep only purely alphabetic tokens (drops punctuation, digits, special characters)
    words = [word for word in words if word.isalpha()]
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    # Lemmatize the remaining words
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)
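
# Illustrative result (hypothetical input):
#   preprocess_text("2 cups of chopped onions!") -> "cup chopped onion"
# ("2" and "!" are dropped as non-alphabetic, "of" as a stopword,
#  and "cups"/"onions" are lemmatized to "cup"/"onion")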

# Function to load data
def load_data(file_path):
    """Load recipes from a JSON file and clean the ingredient text."""
    data = pd.read_json(file_path).T
    data = data[['title', 'ingredients', 'instructions']].dropna(how='any')
    # The raw dataset may store ingredients as a list of strings; flatten to one string
    data['ingredients'] = data['ingredients'].apply(
        lambda x: ' '.join(x) if isinstance(x, list) else str(x))
    data['ingredients'] = data['ingredients'].apply(remove_digits)
    data['ingredients'] = data['ingredients'].apply(remove_ads)
    # Full preprocessing (tokenize/lemmatize) happens once, in parallel, after concatenation
    return data
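
# e.g. load_data('recipes_raw_nosource_ar.json') returns a DataFrame
# with 'title', 'ingredients', and 'instructions' columns.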

# Paths to the raw recipe data files
file_paths = [
    'recipes_raw_nosource_ar.json',
    'recipes_raw_nosource_epi.json',
    'recipes_raw_nosource_fn.json'
]

# Load and concatenate data from all files, resetting the index
data = pd.concat([load_data(file_path) for file_path in file_paths], ignore_index=True)

# Preprocess all ingredient text in parallel across CPU cores
# (note: creating a Pool at import time assumes a fork-based platform such as Linux)
with Pool() as pool:
    data['ingredients'] = pool.map(preprocess_text, data['ingredients'])

# Vectorize the ingredient text with TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['ingredients'])
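
# Quick sanity check one could run here (commented out; illustrative only):
#   assert tfidf_matrix.shape == (len(data), len(vectorizer.vocabulary_))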

def recommend_recipes(input_ingredients, n=5):
    """Recommend the n recipes whose ingredients best match the input."""
    # Preprocess and vectorize the input ingredients
    input_ingredients = preprocess_text(input_ingredients)
    input_vector = vectorizer.transform([input_ingredients])
    # Compute cosine similarity between the input and every recipe
    cosine_similarities = cosine_similarity(input_vector, tfidf_matrix).flatten()
    # Indices of the n most similar recipes, in descending order of similarity
    top_indices = cosine_similarities.argsort()[::-1][:n]
    # Format each match as a numbered Markdown entry
    # (ingredients are shown in their preprocessed form, as stored in `data`)
    recommended_recipes = []
    for i, index in enumerate(top_indices, start=1):
        recipe = data.iloc[index]
        recipe_md = (
            f"{i}. {recipe['title']}\n\n"
            f"_Ingredients:_\n\n{recipe['ingredients']}\n\n"
            f"_Instructions:_\n\n{recipe['instructions']}"
        )
        recommended_recipes.append(recipe_md)
    # Separate the recommendations with blank lines
    return "\n\n\n".join(recommended_recipes)
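
# Illustrative usage (hypothetical input; not run at import time):
#   recommend_recipes("chicken garlic lemon butter", n=3)
# returns a Markdown string describing the three closest recipes.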

# Create and launch the Gradio interface
iface = gr.Interface(fn=recommend_recipes, inputs="text", outputs="text")
iface.launch()