Spaces:

Mattral
/

Excel-Match-Analysis

Sleeping

File size: 8,696 Bytes

05f81dc
 
 
9bb02cd
 
0fcc692
bc8b213
976563d
42ac9eb
db70d75
6cc33f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05f81dc
 
 
 
 
 
 
 
db70d75
05f81dc
f75ddf6
db70d75
 
 
 
05f81dc
 
f33a59a
eca35fa
413670f
db70d75
 
7b72f69
413670f
05f81dc
f33a59a
 
8c784dc
 
 
 
9bb02cd
8c784dc
9bb02cd
 
 
 
 
 
 
 
 
 
 
f33a59a
 
 
 
 
 
 
8b40bd8
f33a59a
8d04a78
42ac9eb
9bb02cd
f33a59a
05f81dc
bc8b213
 
 
 
 
 
1ee593c
8c784dc
1b2e0f1
db70d75
976563d
 
 
 
 
 
 
 
 
 
 
 
 
 
05f81dc
 
 
 
 
 
 
 
 
 
 
 
 
 
745476b
 
2ead8af
05f81dc
 
 
 
c387786
5f6939a
 
 
f75ddf6
05f81dc
5f6939a
d2393e8
05f81dc
5f6939a
 
f75ddf6
d5b07e0
05f81dc
f33a59a
 
 
d69de0d
 
 
49218d6
f33a59a
 
 
56d95c6
5f6939a
d69de0d
 
 
49218d6
f33a59a
eca35fa
6db2cf3
 
 
 
 
 
 
1ee593c
bc8b213
4112189
 
1ee593c
976563d
4112189
 
bc8b213
976563d
4112189
 
56d95c6
976563d
 
 
 
 
 
05f81dc
db70d75

import pandas as pd
import streamlit as st
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from Levenshtein import distance as levenshtein_distance
import matplotlib.pyplot as plt
import seaborn as sns


ms = st.session_state
if "themes" not in ms: 
  ms.themes = {"current_theme": "light",
                    "refreshed": True,
                    
                    "light": {"theme.base": "dark",
                              "theme.backgroundColor": "black",
                              "theme.primaryColor": "#c98bdb",
                              "theme.secondaryBackgroundColor": "#5591f5",
                              "theme.textColor": "white",
                              "theme.textColor": "white",
                              "button_face": "🌜"},

                    "dark":  {"theme.base": "light",
                              "theme.backgroundColor": "white",
                              "theme.primaryColor": "#5591f5",
                              "theme.secondaryBackgroundColor": "#82E1D7",
                              "theme.textColor": "#0a1464",
                              "button_face": "🌞"},
                    }
  

def ChangeTheme():
  previous_theme = ms.themes["current_theme"]
  tdict = ms.themes["light"] if ms.themes["current_theme"] == "light" else ms.themes["dark"]
  for vkey, vval in tdict.items(): 
    if vkey.startswith("theme"): st._config.set_option(vkey, vval)

  ms.themes["refreshed"] = False
  if previous_theme == "dark": ms.themes["current_theme"] = "light"
  elif previous_theme == "light": ms.themes["current_theme"] = "dark"


btn_face = ms.themes["light"]["button_face"] if ms.themes["current_theme"] == "light" else ms.themes["dark"]["button_face"]
st.button(btn_face, on_click=ChangeTheme)

if ms.themes["refreshed"] == False:
  ms.themes["refreshed"] = True
  st.rerun()


def read_csv_or_excel(file):
    # Read CSV or Excel file
    if file.name.endswith('.csv'):
        return pd.read_csv(file)
    elif file.name.endswith('.xlsx') or file.name.endswith('.xls'):
        return pd.read_excel(file)
    else:
        raise ValueError("Unsupported file format. Only CSV and Excel files are supported.")
      

def find_exact_match(df1, df2, column_name):
    # Ensure the column for merging has the same data type
    df1[column_name] = df1[column_name].astype(str).str.strip()
    df2[column_name] = df2[column_name].astype(str).str.strip()
    
    # Find rows with exact matches in the specified column
    matches = pd.merge(df1, df2, on=column_name, how='inner')
    return matches




def find_similar_texts(df1, df2, column_name, threshold=0.3):
    # Find rows with similar texts in the specified column, excluding exact matches
    similar_texts = []
    exact_matches = []

    # Convert numeric values to strings
    df1[column_name] = df1[column_name].astype(str)
    df2[column_name] = df2[column_name].astype(str)
    
    # Concatenate texts from both dataframes
    all_texts = df1[column_name].tolist() + df2[column_name].tolist()
    
    # Compute TF-IDF vectors
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(all_texts)
    
    # Compute cosine similarity matrix
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # Iterate over pairs of rows to find similar texts
    for i, row1 in df1.iterrows():
        for j, row2 in df2.iterrows():
            similarity = similarity_matrix[i, len(df1) + j]
            if similarity >= threshold:
                # Calculate Levenshtein distance between strings
                distance = levenshtein_distance(row1[column_name], row2[column_name])
                max_length = max(len(row1[column_name]), len(row2[column_name]))
                similarity_score = 1 - (distance / max_length)
                if similarity_score >= threshold:
                    if similarity == 1:  # Exact match
                        exact_matches.append((i, j, row1[column_name], row2[column_name]))
                    elif similarity < 0.99:  # Similar but not the same
                        similar_texts.append((i, j, row1[column_name], row2[column_name]))
    
    return similar_texts, exact_matches

def plot_correlation(df, column):
    plt.figure(figsize=(8, 6))
    plt.scatter(df.index, df[column])
    plt.xlabel("Index")
    plt.ylabel(column)
    plt.title(f"Correlation Plot of {column}")
    return plt.gcf()  # Return the matplotlib figure

st.set_option('deprecation.showPyplotGlobalUse', False)

def plot_correlation_matrix(df):
    # Filter for numeric columns, if the DataFrame has non-numeric columns
    numeric_df = df.select_dtypes(include=['number'])
    correlation_matrix = numeric_df.corr()

    # Plotting the heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True, linewidths=0.5)
    plt.title("Correlation Matrix")
    plt.xticks(rotation=45, ha="right")
    plt.yticks(rotation=0)
    plt.tight_layout()  # Adjusts plot to ensure everything fits without overlap
    st.pyplot()  # Use Streamlit's method to display the plot

def main():
    st.title("Item Comparison App")

    # Upload files
    st.header("Upload Files")
    warehouse_file = st.file_uploader("Upload Warehouse Item Stocks (CSV or Excel)")
    industry_file = st.file_uploader("Upload Industry Item Stocks (CSV or Excel)")

    if warehouse_file is not None and industry_file is not None:
        # Read files
        warehouse_df = read_csv_or_excel(warehouse_file)
        industry_df = read_csv_or_excel(industry_file)

        # Get column names
        warehouse_columns = warehouse_df.columns.tolist()
        industry_columns = industry_df.columns.tolist()

        # Select columns using dropdowns
        st.header("Select Columns")
        warehouse_column = st.selectbox("Choose column from warehouse item stocks:", warehouse_columns)
        industry_column = st.selectbox("Choose column from industry item stocks:", industry_columns)
 
        # Compare button
        if st.button("Compare"):
            # Find exact matches
            exact_match = find_exact_match(warehouse_df, industry_df, warehouse_column)

            # Find similar texts
            similar_texts, exact_matches = find_similar_texts(warehouse_df, industry_df, warehouse_column)

            # Display results
            st.header("Exact Matches")
            st.write(exact_match)
 

           # Display exact matches
            st.header("Exact Matches Compare")
            for match in exact_matches:
                st.write(f"Row {match[0]+2} in warehouse item stocks is exactly the same as Row {match[1]+2} in industry item stocks:")
                st.write(f"Warehouse: {match[2]}")
                st.write(f"Industry: {match[3]}")
                st.write(f"____________________")
                st.write()

            # Display similar texts
            st.header("Similar (but Not Same) Texts")
            for text_pair in similar_texts:
                st.write(f"Row {text_pair[0]+2} in warehouse item stocks is similar to Row {text_pair[1]+2} in industry item stocks:")
                st.write(f"Warehouse: {text_pair[2]}")
                st.write(f"Industry: {text_pair[3]}")
                st.write(f"____________________")
                st.write()

            if warehouse_df[warehouse_column].dtype != "object" and industry_df[industry_column].dtype != "object":

                # Calculate correlation
                correlation = warehouse_df[warehouse_column].corr(industry_df[industry_column])
                st.header("Correlation")
                st.write(f"The correlation between {warehouse_column} in warehouse item stocks and {industry_column} in industry item stocks is: {correlation}")
                st.write()


        # Show correlation plot for each dataset
        if st.button("Correlation for each dataset"):
            
            st.subheader("Correlation Plot for 1st Dataset")
            warehouse_corr_plot = plot_correlation(warehouse_df, warehouse_column)
            st.pyplot(warehouse_corr_plot)
                
            st.subheader("Correlation Plot for 2nd Dataset")
            industry_corr_plot = plot_correlation(industry_df, industry_column)
            st.pyplot(industry_corr_plot)

            st.subheader("Correlation Matrix for 1st Dataset")
            plot_correlation_matrix(warehouse_df)

            st.subheader("Correlation Matrix for 2nd Dataset")
            plot_correlation_matrix(industry_df)

if __name__ == "__main__":
    main()