import pandas as pd
import streamlit as st
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from Levenshtein import distance as levenshtein_distance
import matplotlib.pyplot as plt
import seaborn as sns


ms = st.session_state
if "themes" not in ms: 
  ms.themes = {"current_theme": "light",
                    "refreshed": True,
                    
                    "light": {"theme.base": "dark",
                              "theme.backgroundColor": "black",
                              "theme.primaryColor": "#c98bdb",
                              "theme.secondaryBackgroundColor": "#5591f5",
                              "theme.textColor": "white",
                              "theme.textColor": "white",
                              "button_face": "🌜"},

                    "dark":  {"theme.base": "light",
                              "theme.backgroundColor": "white",
                              "theme.primaryColor": "#5591f5",
                              "theme.secondaryBackgroundColor": "#82E1D7",
                              "theme.textColor": "#0a1464",
                              "button_face": "🌞"},
                    }
  

def ChangeTheme():
  previous_theme = ms.themes["current_theme"]
  tdict = ms.themes["light"] if ms.themes["current_theme"] == "light" else ms.themes["dark"]
  for vkey, vval in tdict.items(): 
    if vkey.startswith("theme"): st._config.set_option(vkey, vval)

  ms.themes["refreshed"] = False
  if previous_theme == "dark": ms.themes["current_theme"] = "light"
  elif previous_theme == "light": ms.themes["current_theme"] = "dark"


btn_face = ms.themes["light"]["button_face"] if ms.themes["current_theme"] == "light" else ms.themes["dark"]["button_face"]
st.button(btn_face, on_click=ChangeTheme)

if ms.themes["refreshed"] == False:
  ms.themes["refreshed"] = True
  st.rerun()


def read_csv_or_excel(file):
    # Read CSV or Excel file
    if file.name.endswith('.csv'):
        return pd.read_csv(file)
    elif file.name.endswith('.xlsx') or file.name.endswith('.xls'):
        return pd.read_excel(file)
    else:
        raise ValueError("Unsupported file format. Only CSV and Excel files are supported.")
      

def find_exact_match(df1, df2, column_name):
    # Ensure the column for merging has the same data type
    df1[column_name] = df1[column_name].astype(str).str.strip()
    df2[column_name] = df2[column_name].astype(str).str.strip()
    
    # Find rows with exact matches in the specified column
    matches = pd.merge(df1, df2, on=column_name, how='inner')
    return matches


def find_similar_texts(df1, df2, column_name, threshold=0.3):
    # Find rows with similar texts in the specified column, excluding exact matches
    similar_texts = []
    exact_matches = []

    # Convert numeric values to strings
    df1[column_name] = df1[column_name].astype(str)
    df2[column_name] = df2[column_name].astype(str)
    
    # Concatenate texts from both dataframes
    all_texts = df1[column_name].tolist() + df2[column_name].tolist()
    
    # Compute TF-IDF vectors
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(all_texts)
    
    # Compute cosine similarity matrix
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # Iterate over pairs of rows to find similar texts
    for i, row1 in df1.iterrows():
        for j, row2 in df2.iterrows():
            similarity = similarity_matrix[i, len(df1) + j]
            if similarity >= threshold:
                # Calculate Levenshtein distance between strings
                distance = levenshtein_distance(row1[column_name], row2[column_name])
                max_length = max(len(row1[column_name]), len(row2[column_name]))
                similarity_score = 1 - (distance / max_length)
                if similarity_score >= threshold:
                    if similarity == 1:  # Exact match
                        exact_matches.append((i, j, row1[column_name], row2[column_name]))
                    elif similarity < 0.99:  # Similar but not the same
                        similar_texts.append((i, j, row1[column_name], row2[column_name]))
    
    return similar_texts, exact_matches

def plot_correlation(df, column):
    plt.figure(figsize=(8, 6))
    plt.scatter(df.index, df[column])
    plt.xlabel("Index")
    plt.ylabel(column)
    plt.title(f"Correlation Plot of {column}")
    return plt.gcf()  # Return the matplotlib figure

st.set_option('deprecation.showPyplotGlobalUse', False)

def plot_correlation_matrix(df):
    # Filter for numeric columns, if the DataFrame has non-numeric columns
    numeric_df = df.select_dtypes(include=['number'])
    correlation_matrix = numeric_df.corr()

    # Plotting the heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True, linewidths=0.5)
    plt.title("Correlation Matrix")
    plt.xticks(rotation=45, ha="right")
    plt.yticks(rotation=0)
    plt.tight_layout()  # Adjusts plot to ensure everything fits without overlap
    st.pyplot()  # Use Streamlit's method to display the plot

def main():
    st.title("Item Comparison App")

    # Upload files
    st.header("Upload Files")
    warehouse_file = st.file_uploader("Upload Warehouse Item Stocks (CSV or Excel)")
    industry_file = st.file_uploader("Upload Industry Item Stocks (CSV or Excel)")

    if warehouse_file is not None and industry_file is not None:
        # Read files
        warehouse_df = read_csv_or_excel(warehouse_file)
        industry_df = read_csv_or_excel(industry_file)

        # Get column names
        warehouse_columns = warehouse_df.columns.tolist()
        industry_columns = industry_df.columns.tolist()

        # Select columns using dropdowns
        st.header("Select Columns")
        warehouse_column = st.selectbox("Choose column from warehouse item stocks:", warehouse_columns)
        industry_column = st.selectbox("Choose column from industry item stocks:", industry_columns)
 
        # Compare button
        if st.button("Compare"):
            # Find exact matches
            exact_match = find_exact_match(warehouse_df, industry_df, warehouse_column)

            # Find similar texts
            similar_texts, exact_matches = find_similar_texts(warehouse_df, industry_df, warehouse_column)

            # Display results
            st.header("Exact Matches")
            st.write(exact_match)
 

           # Display exact matches
            st.header("Exact Matches Compare")
            for match in exact_matches:
                st.write(f"Row {match[0]+2} in warehouse item stocks is exactly the same as Row {match[1]+2} in industry item stocks:")
                st.write(f"Warehouse: {match[2]}")
                st.write(f"Industry: {match[3]}")
                st.write(f"____________________")
                st.write()

            # Display similar texts
            st.header("Similar (but Not Same) Texts")
            for text_pair in similar_texts:
                st.write(f"Row {text_pair[0]+2} in warehouse item stocks is similar to Row {text_pair[1]+2} in industry item stocks:")
                st.write(f"Warehouse: {text_pair[2]}")
                st.write(f"Industry: {text_pair[3]}")
                st.write(f"____________________")
                st.write()

            if warehouse_df[warehouse_column].dtype != "object" and industry_df[industry_column].dtype != "object":

                # Calculate correlation
                correlation = warehouse_df[warehouse_column].corr(industry_df[industry_column])
                st.header("Correlation")
                st.write(f"The correlation between {warehouse_column} in warehouse item stocks and {industry_column} in industry item stocks is: {correlation}")
                st.write()


        # Show correlation plot for each dataset
        if st.button("Correlation for each dataset"):
            
            st.subheader("Correlation Plot for 1st Dataset")
            warehouse_corr_plot = plot_correlation(warehouse_df, warehouse_column)
            st.pyplot(warehouse_corr_plot)
                
            st.subheader("Correlation Plot for 2nd Dataset")
            industry_corr_plot = plot_correlation(industry_df, industry_column)
            st.pyplot(industry_corr_plot)

            st.subheader("Correlation Matrix for 1st Dataset")
            plot_correlation_matrix(warehouse_df)

            st.subheader("Correlation Matrix for 2nd Dataset")
            plot_correlation_matrix(industry_df)

if __name__ == "__main__":
    main()