# Necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import gradio as gr
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from multiprocessing import Pool
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)

# Download necessary NLTK data
logging.info("Downloading NLTK data...")
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
logging.info("NLTK data downloaded successfully.")

# Initialize a WordNet lemmatizer
logging.info("Initializing WordNet lemmatizer...")
lemmatizer = WordNetLemmatizer()
logging.info("WordNet lemmatizer initialized successfully.")


# Preprocessing functions
def remove_digits(s):
    """Remove digits from a string."""
    return re.sub(r'\d', '', str(s))


def remove_ads(s):
    """Remove the word 'ADVERTISEMENT' from a string."""
    return re.sub(r'ADVERTISEMENT', '', str(s))


stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Preprocess text by tokenizing, converting to lower case,
    removing punctuation and stopwords, and lemmatizing."""
    # Tokenize and convert to lower case
    words = word_tokenize(text.lower())
    # Keep alphabetic tokens only (drops punctuation and special characters)
    words = [word for word in words if word.isalpha()]
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    # Lemmatize the words
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)


# Function to load data
def load_data(file_path):
    """Load data from a JSON file and clean the ingredients column."""
    logging.info(f"Loading data from {file_path}...")
    data = pd.read_json(file_path).T
    data = data[['title', 'ingredients', 'instructions']].dropna(how='any')
    data['ingredients'] = data['ingredients'].apply(remove_digits)
    data['ingredients'] = data['ingredients'].apply(remove_ads)
    # Note: full text preprocessing is applied later, in parallel, over the
    # concatenated dataset, so it is not duplicated here.
    logging.info(f"Data loaded from {file_path} successfully.")
    return data


# Paths to data files
file_paths = [
    'recipes_raw_nosource_ar.json',
    'recipes_raw_nosource_epi.json',
    'recipes_raw_nosource_fn.json'
]

# Load and concatenate data from all files
logging.info("Loading and concatenating data from all files...")
data = pd.concat([load_data(file_path) for file_path in file_paths])
data.index = range(len(data))
logging.info("Data loaded and concatenated successfully.")

# Multiprocessing pool to apply the preprocessing function to each ingredient
logging.info("Applying preprocessing function to each ingredient...")
with Pool() as pool:
    data['ingredients'] = pool.map(preprocess_text, data['ingredients'])
logging.info("Preprocessing function applied successfully.")

# Vectorize the ingredients text
logging.info("Vectorizing the ingredients text...")
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['ingredients'])
logging.info("Ingredients text vectorized successfully.")


# Function to recommend recipes
def recommend_recipes(input_ingredients, n=5):
    """Recommend the n recipes whose ingredients best match the input."""
    logging.info("Recommending recipes...")

    # Preprocess and vectorize the input ingredients
    logging.info("Preprocessing and vectorizing the input ingredients...")
    input_ingredients = preprocess_text(input_ingredients)
    input_vector = vectorizer.transform([input_ingredients])
    logging.info("Input ingredients preprocessed and vectorized successfully.")

    # Compute cosine similarity between the input and all recipes
    logging.info("Computing cosine similarity between input and all recipes...")
    cosine_similarities = cosine_similarity(input_vector, tfidf_matrix).flatten()
    logging.info("Cosine similarity computed successfully.")

    # Get indices of the n recipes with highest similarity
    logging.info("Getting indices of recipes with highest similarity...")
    top_indices = cosine_similarities.argsort()[:-n - 1:-1]
    logging.info("Indices of recipes with highest similarity obtained successfully.")

    # Build the full text of the most similar recipes
    logging.info("Returning full recipes of most similar recipes...")
    recommended_recipes = []
    for i, index in enumerate(top_indices, start=1):
        recipe = data.iloc[index]
        # Format the recipe with numbering
        recipe_md = (
            f"{i}. {recipe['title']}\n\n"
            f"_Ingredients:_\n\n{recipe['ingredients']}\n\n"
            f"_Instructions:_\n\n{recipe['instructions']}"
        )
        recommended_recipes.append(recipe_md)
    logging.info("Full recipes of most similar recipes returned successfully.")

    # Join the recommended recipes, separated by blank lines
    return "\n\n\n".join(recommended_recipes)


# Create and launch the Gradio interface
logging.info("Creating Gradio interface...")
iface = gr.Interface(
    fn=recommend_recipes,
    inputs=gr.Textbox(lines=2, label="Enter Ingredients"),
    outputs="textbox",
)
logging.info("Gradio interface created successfully.")

iface.launch()
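
# A minimal sketch of calling the recommender directly, without the Gradio UI.
# It assumes the three data files above are present and the script has run
# through the vectorization step. Since iface.launch() blocks, run this in an
# interactive session (or before the launch call). The ingredient string and
# n value here are only illustrative.
#
#     print(recommend_recipes("chicken garlic lemon butter", n=3))
#
# This prints the three recipes whose TF-IDF ingredient vectors have the
# highest cosine similarity to the query.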