# Necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import gradio as gr
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from multiprocessing import Pool

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Initialize a WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Preprocessing functions
def remove_digits(s):
    """Remove digits from a string."""
    return re.sub(r'[\d]', '', str(s))

def remove_ads(s):
    """Remove the word 'ADVERTISEMENT' from a string."""
    return re.sub(r'ADVERTISEMENT', '', str(s))

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Preprocess text by tokenizing, converting to lower case, removing punctuation and stopwords, and lemmatizing."""
    # Tokenize and convert to lower case
    words = word_tokenize(text.lower())

    # Remove punctuation and special characters
    words = [word for word in words if word.isalpha()]

    # Remove stopwords during tokenization
    words = [word for word in words if word not in stop_words]

    # Lemmatize the words
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)
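
# Illustrative example (a rough sketch; exact output depends on the installed
# NLTK models and stopword list):
#   preprocess_text("2 cups of chopped Tomatoes!")  ->  "cup chopped tomato"
# Digits and punctuation fail isalpha(), "of" is a stopword, and the remaining
# tokens are lowercased and lemmatized.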

# Function to load data
def load_data(file_path):
    """Load recipes from a JSON file and clean the raw ingredients text."""
    data = pd.read_json(file_path).T
    data = data[['title', 'ingredients', 'instructions']].dropna(how='any')
    data['ingredients'] = data['ingredients'].apply(remove_digits)
    data['ingredients'] = data['ingredients'].apply(remove_ads)
    # The heavier text preprocessing is applied once, in parallel, after all files are loaded
    return data
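
# Assumed input layout (the recipes_raw_nosource_*.json files are expected to
# map recipe IDs to recipe objects, hence the transpose above to get one row
# per recipe):
#   {"<recipe_id>": {"title": "...", "ingredients": ["..."], "instructions": "..."}, ...}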

# Paths to the raw recipe data files
file_paths = [
    'recipes_raw_nosource_ar.json',
    'recipes_raw_nosource_epi.json',
    'recipes_raw_nosource_fn.json'
]

# Load and concatenate data from all files, resetting the index to 0..N-1
data = pd.concat([load_data(file_path) for file_path in file_paths], ignore_index=True)

# Use a multiprocessing pool to preprocess each recipe's ingredients text in parallel
with Pool() as pool:
    data['ingredients'] = pool.map(preprocess_text, data['ingredients'])
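
# Note: Pool() defaults to one worker per CPU core. On platforms that spawn
# rather than fork worker processes (e.g. Windows), this top-level usage would
# need to be wrapped in an `if __name__ == "__main__":` guard.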

# Vectorize the ingredients text
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['ingredients'])
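
# tfidf_matrix is a sparse (n_recipes x n_vocabulary_terms) matrix: each row is
# the TF-IDF-weighted bag-of-words vector of one recipe's ingredients, which
# recommend_recipes() below compares against the vectorized user query.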

def recommend_recipes(input_ingredients, n=5):
    """Recommend recipes based on input ingredients."""
    # Preprocess and vectorize the input ingredients
    input_ingredients = preprocess_text(input_ingredients)
    input_vector = vectorizer.transform([input_ingredients])

    # Compute cosine similarity between input and all recipes
    cosine_similarities = cosine_similarity(input_vector, tfidf_matrix).flatten()

    # Get indices of recipes with highest similarity
    top_indices = cosine_similarities.argsort()[:-n-1:-1]

    # Build a numbered Markdown entry for each of the most similar recipes
    recommended_recipes = []
    for i, index in enumerate(top_indices, start=1):
        recipe = data.iloc[index]
        recipe_dict = {
            "Title": recipe['title'],
            "Ingredients": recipe['ingredients'],
            "Instructions": recipe['instructions']
        }
        # Format the recipe with numbering
        recipe_md = f"{i}. {recipe_dict['Title']}\n\n_Ingredients:_\n\n{recipe_dict['Ingredients']}\n\n_Instructions:_\n\n{recipe_dict['Instructions']}"
        recommended_recipes.append(recipe_md)

    # Join the recommended recipes with appropriate formatting
    recommended_recipes_str = "\n\n\n".join(recommended_recipes)
    return recommended_recipes_str
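
# Quick sanity check outside the UI (hypothetical query, left commented out so
# the Gradio app remains the only entry point):
#   print(recommend_recipes("chicken garlic lemon butter", n=3))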

# Create the Gradio interface and launch the app
iface = gr.Interface(fn=recommend_recipes, inputs="text", outputs=gr.Markdown())
iface.launch()