Spaces:
Sleeping
Sleeping
File size: 8,696 Bytes
05f81dc 9bb02cd 0fcc692 bc8b213 976563d 42ac9eb db70d75 6cc33f4 05f81dc db70d75 05f81dc f75ddf6 db70d75 05f81dc f33a59a eca35fa 413670f db70d75 7b72f69 413670f 05f81dc f33a59a 8c784dc 9bb02cd 8c784dc 9bb02cd f33a59a 8b40bd8 f33a59a 8d04a78 42ac9eb 9bb02cd f33a59a 05f81dc bc8b213 1ee593c 8c784dc 1b2e0f1 db70d75 976563d 05f81dc 745476b 2ead8af 05f81dc c387786 5f6939a f75ddf6 05f81dc 5f6939a d2393e8 05f81dc 5f6939a f75ddf6 d5b07e0 05f81dc f33a59a d69de0d 49218d6 f33a59a 56d95c6 5f6939a d69de0d 49218d6 f33a59a eca35fa 6db2cf3 1ee593c bc8b213 4112189 1ee593c 976563d 4112189 bc8b213 976563d 4112189 56d95c6 976563d 05f81dc db70d75 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 |
import pandas as pd
import streamlit as st
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from Levenshtein import distance as levenshtein_distance
import matplotlib.pyplot as plt
import seaborn as sns
# Theme bookkeeping is kept in Streamlit's session state so it survives reruns.
ms = st.session_state
if "themes" not in ms:
    # Each named entry holds the options that ChangeTheme applies while that
    # theme is the current one, i.e. the look being switched *to*. That is why
    # the "light" entry carries dark colours and the "dark" entry light ones.
    # NOTE(review): the "π" button faces look like mis-encoded emoji -- confirm
    # the intended glyphs against the original source.
    ms.themes = {
        "current_theme": "light",
        # Set to False by ChangeTheme to request one extra rerun.
        "refreshed": True,
        "light": {
            "theme.base": "dark",
            "theme.backgroundColor": "black",
            "theme.primaryColor": "#c98bdb",
            "theme.secondaryBackgroundColor": "#5591f5",
            "theme.textColor": "white",
            "button_face": "π",
        },
        "dark": {
            "theme.base": "light",
            "theme.backgroundColor": "white",
            "theme.primaryColor": "#5591f5",
            "theme.secondaryBackgroundColor": "#82E1D7",
            "theme.textColor": "#0a1464",
            "button_face": "π",
        },
    }
def ChangeTheme():
    """Apply the stored options for the active theme, then flip the active theme.

    Reads and updates ``ms.themes``. Every entry of the selected option set
    whose key starts with ``"theme"`` is written through
    ``st._config.set_option``. The ``"refreshed"`` flag is cleared, and
    ``"current_theme"`` is toggled between ``"light"`` and ``"dark"``; any
    other value is left as it is.
    """
    themes = ms.themes
    active = themes["current_theme"]

    # Anything other than "light" falls back to the "dark" option set.
    option_set = themes["light" if active == "light" else "dark"]
    for option, value in option_set.items():
        if option.startswith("theme"):
            st._config.set_option(option, value)

    themes["refreshed"] = False

    if active in ("dark", "light"):
        themes["current_theme"] = "light" if active == "dark" else "dark"
# The toggle button shows the face registered for the currently active theme;
# anything other than "light" uses the "dark" entry.
btn_face = ms.themes[
    "light" if ms.themes["current_theme"] == "light" else "dark"
]["button_face"]
st.button(btn_face, on_click=ChangeTheme)

# ChangeTheme clears "refreshed" after changing the theme options; rerun once
# and set the flag back so this does not loop.
if not ms.themes["refreshed"]:
    ms.themes["refreshed"] = True
    st.rerun()
def read_csv_or_excel(file):
    """Load an uploaded CSV or Excel file into a DataFrame.

    Args:
        file: File-like object with a ``name`` attribute; the extension of
            that name selects the parser.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        ValueError: If the name does not end in ``.csv``, ``.xlsx`` or
            ``.xls`` (compared case-insensitively).
    """
    # Lower-case the name so "DATA.CSV" or "Stock.XLSX" are accepted too.
    filename = file.name.lower()
    if filename.endswith(".csv"):
        return pd.read_csv(file)
    if filename.endswith((".xlsx", ".xls")):
        return pd.read_excel(file)
    raise ValueError("Unsupported file format. Only CSV and Excel files are supported.")
def find_exact_match(df1, df2, column_name):
    """Return the rows of both frames whose ``column_name`` values are equal.

    Values are compared as whitespace-stripped strings, so ``2`` and ``" 2"``
    match. The inputs are not modified.

    Args:
        df1: Left DataFrame; must contain ``column_name``.
        df2: Right DataFrame; must contain ``column_name``.
        column_name: Column present in both frames to join on.

    Returns:
        Inner merge of the two frames on ``column_name``.
    """
    # Normalise on copies: casting the callers' columns to str in place would
    # change their dtype for every later use of the same frames.
    left = df1.copy()
    right = df2.copy()
    left[column_name] = left[column_name].astype(str).str.strip()
    right[column_name] = right[column_name].astype(str).str.strip()
    return pd.merge(left, right, on=column_name, how='inner')
def find_similar_texts(df1, df2, column_name, threshold=0.3):
    """Pair up rows of two frames whose ``column_name`` texts are alike.

    A pair is kept when both its TF-IDF cosine similarity and its normalised
    Levenshtein similarity (``1 - distance / longer_length``) reach
    ``threshold``. Kept pairs are split by string equality into exact matches
    and merely similar texts. The inputs are not modified.

    Args:
        df1: First DataFrame; must contain ``column_name``.
        df2: Second DataFrame; must contain ``column_name``.
        column_name: Column whose values (cast to ``str``) are compared.
        threshold: Minimum score, applied to both similarity measures.

    Returns:
        ``(similar_texts, exact_matches)``: two lists of
        ``(i, j, text1, text2)`` tuples, where ``i`` and ``j`` are 0-based row
        positions in ``df1`` and ``df2``. With a default RangeIndex these
        equal the index labels.
    """
    similar_texts = []
    exact_matches = []

    # Plain string lists keep the callers' DataFrames untouched and give
    # positional indices that line up with the similarity matrix rows.
    texts1 = df1[column_name].astype(str).tolist()
    texts2 = df2[column_name].astype(str).tolist()
    if not texts1 or not texts2:
        return similar_texts, exact_matches

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform(texts1 + texts2)
    except ValueError:
        # scikit-learn raises this when no document yields a token (empty
        # vocabulary); without TF-IDF vectors no pair can reach the threshold.
        return similar_texts, exact_matches

    # Only the df1-vs-df2 block is needed, not the full square matrix.
    split = len(texts1)
    similarity_matrix = cosine_similarity(tfidf_matrix[:split], tfidf_matrix[split:])

    for i, text1 in enumerate(texts1):
        for j, text2 in enumerate(texts2):
            if similarity_matrix[i, j] < threshold:
                continue

            max_length = max(len(text1), len(text2))
            if max_length == 0:
                # Two empty strings are identical; avoid dividing by zero.
                similarity_score = 1.0
            else:
                distance = levenshtein_distance(text1, text2)
                similarity_score = 1 - (distance / max_length)
            if similarity_score < threshold:
                continue

            # Decide exactness on the strings themselves: a float cosine score
            # is not reliably equal to 1 and can be 1 for reordered words.
            pair = (i, j, text1, text2)
            if text1 == text2:
                exact_matches.append(pair)
            else:
                similar_texts.append(pair)

    return similar_texts, exact_matches
def plot_correlation(df, column):
    """Scatter the values of ``column`` against the DataFrame index.

    Args:
        df: DataFrame providing the index (x axis) and the column (y axis).
        column: Name of the column to plot; also used as the y label and in
            the title.

    Returns:
        The Matplotlib figure holding the scatter plot.
    """
    figure, axes = plt.subplots(figsize=(8, 6))
    axes.scatter(df.index, df[column])
    axes.set_xlabel("Index")
    axes.set_ylabel(column)
    axes.set_title(f"Correlation Plot of {column}")
    return figure
# Turns off the 'deprecation.showPyplotGlobalUse' option; presumably this
# hides the warning Streamlit shows when st.pyplot() is called without an
# explicit figure, as plot_correlation_matrix does.
# NOTE(review): newer Streamlit releases may no longer recognise this option
# -- confirm against the pinned Streamlit version.
st.set_option('deprecation.showPyplotGlobalUse', False)
def plot_correlation_matrix(df):
    """Render a correlation heatmap of the numeric columns of ``df``.

    The heatmap is drawn on its own figure, passed explicitly to
    ``st.pyplot`` and then closed, so no global pyplot state is used and
    figures do not pile up across reruns. When ``df`` has no numeric columns
    a warning is shown instead of a plot.

    Args:
        df: DataFrame to analyse; non-numeric columns are ignored.

    Returns:
        None. The plot (or warning) is written to the Streamlit page.
    """
    numeric_df = df.select_dtypes(include=['number'])
    if numeric_df.shape[1] == 0:
        # An empty correlation matrix cannot be drawn as a heatmap.
        st.warning("No numeric columns available to build a correlation matrix.")
        return

    correlation_matrix = numeric_df.corr()

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        fmt=".2f",
        cmap='coolwarm',
        cbar=True,
        linewidths=0.5,
        ax=ax,
    )
    ax.set_title("Correlation Matrix")
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    plt.setp(ax.get_yticklabels(), rotation=0)
    fig.tight_layout()  # Keep rotated labels inside the figure.

    st.pyplot(fig)
    plt.close(fig)  # Release the figure once Streamlit has rendered it.
def _align_industry_column(industry_df, industry_column, target_column):
    """Return a copy of ``industry_df`` with ``industry_column`` named ``target_column``."""
    if industry_column == target_column:
        return industry_df.copy()
    # Drop an unrelated column that already uses the target name so the rename
    # cannot produce duplicate column labels.
    aligned = industry_df.drop(columns=[target_column], errors="ignore")
    return aligned.rename(columns={industry_column: target_column})


def _show_pairs(header, relation, pairs):
    """Write one header followed by a short report for every matched pair."""
    st.header(header)
    for left_pos, right_pos, left_text, right_text in pairs:
        # +2 turns a 0-based row position into a spreadsheet row number
        # (1-based, with the header occupying row 1).
        st.write(f"Row {left_pos+2} in warehouse item stocks {relation} Row {right_pos+2} in industry item stocks:")
        st.write(f"Warehouse: {left_text}")
        st.write(f"Industry: {right_text}")
        st.write("____________________")


def _show_comparison(warehouse_df, industry_df, warehouse_column, industry_column):
    """Write exact matches, similar texts and, for numeric columns, the correlation."""
    # Check the dtypes and hand copies to the helpers: the helpers may cast
    # the compared column to str, which would hide a numeric column.
    both_numeric = (
        pd.api.types.is_numeric_dtype(warehouse_df[warehouse_column])
        and pd.api.types.is_numeric_dtype(industry_df[industry_column])
    )
    left = warehouse_df.copy()
    # The helpers join on a single column name, so expose the selected
    # industry column under the warehouse column's name.
    right = _align_industry_column(industry_df, industry_column, warehouse_column)

    exact_match = find_exact_match(left, right, warehouse_column)
    similar_texts, exact_matches = find_similar_texts(left, right, warehouse_column)

    st.header("Exact Matches")
    st.write(exact_match)
    _show_pairs("Exact Matches Compare", "is exactly the same as", exact_matches)
    _show_pairs("Similar (but Not Same) Texts", "is similar to", similar_texts)

    if both_numeric:
        correlation = warehouse_df[warehouse_column].corr(industry_df[industry_column])
        st.header("Correlation")
        st.write(f"The correlation between {warehouse_column} in warehouse item stocks and {industry_column} in industry item stocks is: {correlation}")


def _show_correlation_plots(warehouse_df, industry_df, warehouse_column, industry_column):
    """Write the per-column scatter plots and the correlation matrices of both files."""
    st.subheader("Correlation Plot for 1st Dataset")
    warehouse_corr_plot = plot_correlation(warehouse_df, warehouse_column)
    st.pyplot(warehouse_corr_plot)
    plt.close(warehouse_corr_plot)  # Avoid accumulating figures across reruns.

    st.subheader("Correlation Plot for 2nd Dataset")
    industry_corr_plot = plot_correlation(industry_df, industry_column)
    st.pyplot(industry_corr_plot)
    plt.close(industry_corr_plot)

    st.subheader("Correlation Matrix for 1st Dataset")
    plot_correlation_matrix(warehouse_df)
    st.subheader("Correlation Matrix for 2nd Dataset")
    plot_correlation_matrix(industry_df)


def main():
    """Run the Streamlit page for comparing warehouse and industry item stocks.

    The page asks for two files (CSV or Excel) and one column from each.
    "Compare" lists exact matches and similar texts between the selected
    columns, plus their correlation when both are numeric. "Correlation for
    each dataset" shows a scatter plot of each selected column and a
    correlation matrix for each file.
    """
    st.title("Item Comparison App")

    st.header("Upload Files")
    warehouse_file = st.file_uploader("Upload Warehouse Item Stocks (CSV or Excel)")
    industry_file = st.file_uploader("Upload Industry Item Stocks (CSV or Excel)")
    if warehouse_file is None or industry_file is None:
        return

    try:
        warehouse_df = read_csv_or_excel(warehouse_file)
        industry_df = read_csv_or_excel(industry_file)
    except ValueError as err:
        # Unsupported extension or unparsable content: show the message on
        # the page instead of a traceback.
        st.error(str(err))
        return

    st.header("Select Columns")
    warehouse_column = st.selectbox("Choose column from warehouse item stocks:", warehouse_df.columns.tolist())
    industry_column = st.selectbox("Choose column from industry item stocks:", industry_df.columns.tolist())

    if st.button("Compare"):
        _show_comparison(warehouse_df, industry_df, warehouse_column, industry_column)

    # Kept beside the Compare branch, not inside it: a button is only True on
    # the rerun caused by its own click, so a button nested under another
    # button's branch would never render its content.
    if st.button("Correlation for each dataset"):
        _show_correlation_plots(warehouse_df, industry_df, warehouse_column, industry_column)
# Script entry point: build the page when the file is executed directly.
if __name__ == "__main__":
    main()