Lovisticsdev committed on
Commit a6b3953 · 1 Parent(s): 6e2de00
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ recipes_raw_nosource_ar.json filter=lfs diff=lfs merge=lfs -text
+ recipes_raw_nosource_epi.json filter=lfs diff=lfs merge=lfs -text
+ recipes_raw_nosource_fn.json filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,110 @@
+ # Necessary libraries
+ import pandas as pd
+ import numpy as np
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+ import re
+ import gradio as gr
+ import json
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+ from nltk.tokenize import word_tokenize
+ from multiprocessing import Pool
+
+ # Download necessary NLTK data
+ nltk.download('punkt')
+ nltk.download('wordnet')
+ nltk.download('stopwords')
+
+ # Initialize a WordNet lemmatizer
+ lemmatizer = WordNetLemmatizer()
+
+ # Preprocessing functions
+ def remove_digits(s):
+     """Remove digits from a string."""
+     return re.sub(r'\d', '', str(s))
+
+ def remove_ads(s):
+     """Remove the word 'ADVERTISEMENT' from a string."""
+     return re.sub(r'ADVERTISEMENT', '', str(s))
+
+ stop_words = set(stopwords.words('english'))
+
+ def preprocess_text(text):
+     """Preprocess text: tokenize, lower-case, remove punctuation and stopwords, and lemmatize."""
+     # Tokenize and convert to lower case
+     words = word_tokenize(text.lower())
+
+     # Keep only alphabetic tokens, dropping punctuation and special characters
+     words = [word for word in words if word.isalpha()]
+
+     # Remove stopwords
+     words = [word for word in words if word not in stop_words]
+
+     # Lemmatize the words
+     words = [lemmatizer.lemmatize(word) for word in words]
+
+     return ' '.join(words)
+
+ # Function to load data
+ def load_data(file_path):
+     """Load recipes from a JSON file and clean the ingredients column."""
+     data = pd.read_json(file_path).T
+     data = data[['title', 'ingredients', 'instructions']].dropna(how='any')
+     data['ingredients'] = data['ingredients'].apply(remove_digits)
+     data['ingredients'] = data['ingredients'].apply(remove_ads)
+     # preprocess_text is applied once, in parallel, after all files are loaded
+     return data
+
+ # Paths to the data files
+ file_paths = [
+     'recipes_raw_nosource_ar.json',
+     'recipes_raw_nosource_epi.json',
+     'recipes_raw_nosource_fn.json'
+ ]
+
+ # Load and concatenate data from all files
+ data = pd.concat([load_data(file_path) for file_path in file_paths])
+ data.index = range(len(data))
+
+ # Multiprocessing pool to apply the preprocessing function to each ingredient list
+ with Pool() as pool:
+     data['ingredients'] = pool.map(preprocess_text, data['ingredients'])
+
+ # Vectorize the ingredients text
+ vectorizer = TfidfVectorizer()
+ tfidf_matrix = vectorizer.fit_transform(data['ingredients'])
+
+ def recommend_recipes(input_ingredients, n=5):
+     """Recommend recipes based on input ingredients."""
+     # Preprocess and vectorize the input ingredients
+     input_ingredients = preprocess_text(input_ingredients)
+     input_vector = vectorizer.transform([input_ingredients])
+
+     # Compute cosine similarity between the input and all recipes
+     cosine_similarities = cosine_similarity(input_vector, tfidf_matrix).flatten()
+
+     # Indices of the n most similar recipes, best match first
+     top_indices = cosine_similarities.argsort()[:-n-1:-1]
+
+     # Collect the full text of the most similar recipes
+     recommended_recipes = []
+     for i, index in enumerate(top_indices, start=1):
+         recipe = data.iloc[index]
+         recipe_dict = {
+             "Title": recipe['title'],
+             "Ingredients": recipe['ingredients'],
+             "Instructions": recipe['instructions']
+         }
+         # Format the recipe with numbering
+         recipe_md = f"{i}. {recipe_dict['Title']}\n\n_Ingredients:_\n\n{recipe_dict['Ingredients']}\n\n_Instructions:_\n\n{recipe_dict['Instructions']}"
+         recommended_recipes.append(recipe_md)
+
+     # Join the recommended recipes with blank lines between them
+     recommended_recipes_str = "\n\n\n".join(recommended_recipes)
+     return recommended_recipes_str
+
+ # Create and launch the Gradio interface
+ iface = gr.Interface(fn=recommend_recipes, inputs="text", outputs="text")
+ iface.launch()
recipes_raw_nosource_ar.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:93da2202eacb85ad81b50e49f9c1ceba33eb298f1c82a6d02eb59cab7d550cb5
+ size 49784325
recipes_raw_nosource_epi.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:08c7c8103a9c0dd114dc3fe01490fdf86ec9dee05d4db7d96504a61b5e8a886e
+ size 61133971
recipes_raw_nosource_fn.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ea2487a1a6f81cd395754dc355fa12c47b51e5fac21d23f26fb0fd00479307f7
+ size 93702755
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ pandas
+ numpy
+ scikit-learn
+ gradio
+ nltk