Spaces:

Mattral
/

Excel-Match-Analysis

Sleeping

App Files Files Community

Mattral committed on Apr 26, 2024

Commit

f33a59a

verified ·

1 Parent(s): e2e62c4

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -18

app.py CHANGED Viewed

@@ -59,16 +59,14 @@ def read_csv_or_excel(file):
 def find_exact_matches(df1, df2, column_name):
     # Find rows with exact matches in the specified column
     matches = pd.merge(df1, df2, on=column_name, how='inner')
-    return
-def find_similar_texts(df1, df2, column_name, exact_matches, threshold=0.3):
-    # Find rows with similar texts in the specified column, excluding exact matches
     similar_texts = []
-    exact_match_indices = set(exact_matches.index.tolist())
     # Concatenate texts from both dataframes
     all_texts = df1[column_name].astype(str).tolist() + df2[column_name].astype(str).tolist()
@@ -82,17 +80,19 @@ def find_similar_texts(df1, df2, column_name, exact_matches, threshold=0.3):
     # Iterate over pairs of rows to find similar texts
     for i, row1 in df1.iterrows():
         for j, row2 in df2.iterrows():
-            if i not in exact_match_indices and j not in exact_match_indices:
-                similarity = similarity_matrix[i, len(df1) + j]
-                if similarity >= threshold and similarity < 1:  # Exclude exact matches
-                    # Calculate Levenshtein distance between strings
-                    distance = levenshtein_distance(row1[column_name], row2[column_name])
-                    max_length = max(len(row1[column_name]), len(row2[column_name]))
-                    similarity_score = 1 - (distance / max_length)
-                    if similarity_score >= threshold:
                         similar_texts.append((i, j, row1[column_name], row2[column_name]))
-    return similar_texts
 def main():
@@ -130,13 +130,21 @@ def main():
             st.write(exact_matches)
             st.header("Similar (but Not Same) Texts")
             for text_pair in similar_texts:
                 st.write(f"Row {text_pair[0]} in warehouse item stocks is similar to Row {text_pair[1]} in industry item stocks:")
                 st.write(f"Warehouse: {text_pair[2]}")
                 st.write(f"Industry: {text_pair[3]}")
-                st.write
 if __name__ == "__main__":

 def find_exact_matches(df1, df2, column_name):
     # Find rows with exact matches in the specified column
     matches = pd.merge(df1, df2, on=column_name, how='inner')
+    return matches
+def find_similar_texts(df1, df2, column_name, threshold=0.3):
+    # Find rows with similar texts in the specified column, including exact matches
     similar_texts = []
+    exact_matches = []
     # Concatenate texts from both dataframes
     all_texts = df1[column_name].astype(str).tolist() + df2[column_name].astype(str).tolist()
     # Iterate over pairs of rows to find similar texts
     for i, row1 in df1.iterrows():
         for j, row2 in df2.iterrows():
+            similarity = similarity_matrix[i, len(df1) + j]
+            if similarity >= threshold:
+                # Calculate Levenshtein distance between strings
+                distance = levenshtein_distance(row1[column_name], row2[column_name])
+                max_length = max(len(row1[column_name]), len(row2[column_name]))
+                similarity_score = 1 - (distance / max_length)
+                if similarity_score >= threshold:
+                    if similarity == 1:  # Exact match
+                        exact_matches.append((i, j, row1[column_name], row2[column_name]))
+                    else:
                         similar_texts.append((i, j, row1[column_name], row2[column_name]))
+    return similar_texts, exact_matches
 def main():
             st.write(exact_matches)
+           # Display exact matches
+            st.header("Exact Matches Compare")
+            for match in exact_matches:
+                st.write(f"Row {match[0]} in warehouse item stocks is exactly the same as Row {match[1]} in industry item stocks:")
+                st.write(f"Warehouse: {match[2]}")
+                st.write(f"Industry: {match[3]}")
+                st.write()
+            # Display similar texts
             st.header("Similar (but Not Same) Texts")
             for text_pair in similar_texts:
                 st.write(f"Row {text_pair[0]} in warehouse item stocks is similar to Row {text_pair[1]} in industry item stocks:")
                 st.write(f"Warehouse: {text_pair[2]}")
                 st.write(f"Industry: {text_pair[3]}")
+                st.write()
 if __name__ == "__main__":