Mattral committed on
Commit
f33a59a
·
verified ·
1 Parent(s): e2e62c4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -18
app.py CHANGED
@@ -59,16 +59,14 @@ def read_csv_or_excel(file):
59
  def find_exact_matches(df1, df2, column_name):
60
  # Find rows with exact matches in the specified column
61
  matches = pd.merge(df1, df2, on=column_name, how='inner')
62
- return
63
 
64
 
65
-
66
-
67
- def find_similar_texts(df1, df2, column_name, exact_matches, threshold=0.3):
68
- # Find rows with similar texts in the specified column, excluding exact matches
69
  similar_texts = []
70
- exact_match_indices = set(exact_matches.index.tolist())
71
-
72
  # Concatenate texts from both dataframes
73
  all_texts = df1[column_name].astype(str).tolist() + df2[column_name].astype(str).tolist()
74
 
@@ -82,17 +80,19 @@ def find_similar_texts(df1, df2, column_name, exact_matches, threshold=0.3):
82
  # Iterate over pairs of rows to find similar texts
83
  for i, row1 in df1.iterrows():
84
  for j, row2 in df2.iterrows():
85
- if i not in exact_match_indices and j not in exact_match_indices:
86
- similarity = similarity_matrix[i, len(df1) + j]
87
- if similarity >= threshold and similarity < 1: # Exclude exact matches
88
- # Calculate Levenshtein distance between strings
89
- distance = levenshtein_distance(row1[column_name], row2[column_name])
90
- max_length = max(len(row1[column_name]), len(row2[column_name]))
91
- similarity_score = 1 - (distance / max_length)
92
- if similarity_score >= threshold:
 
 
93
  similar_texts.append((i, j, row1[column_name], row2[column_name]))
94
 
95
- return similar_texts
96
 
97
 
98
  def main():
@@ -130,13 +130,21 @@ def main():
130
  st.write(exact_matches)
131
 
132
 
 
 
 
 
 
 
 
 
 
133
  st.header("Similar (but Not Same) Texts")
134
  for text_pair in similar_texts:
135
  st.write(f"Row {text_pair[0]} in warehouse item stocks is similar to Row {text_pair[1]} in industry item stocks:")
136
  st.write(f"Warehouse: {text_pair[2]}")
137
  st.write(f"Industry: {text_pair[3]}")
138
- st.write
139
-
140
 
141
 
142
  if __name__ == "__main__":
 
59
  def find_exact_matches(df1, df2, column_name):
60
  # Find rows with exact matches in the specified column
61
  matches = pd.merge(df1, df2, on=column_name, how='inner')
62
+ return matches
63
 
64
 
65
+ def find_similar_texts(df1, df2, column_name, threshold=0.3):
66
+ # Find rows with similar texts in the specified column, including exact matches
 
 
67
  similar_texts = []
68
+ exact_matches = []
69
+
70
  # Concatenate texts from both dataframes
71
  all_texts = df1[column_name].astype(str).tolist() + df2[column_name].astype(str).tolist()
72
 
 
80
  # Iterate over pairs of rows to find similar texts
81
  for i, row1 in df1.iterrows():
82
  for j, row2 in df2.iterrows():
83
+ similarity = similarity_matrix[i, len(df1) + j]
84
+ if similarity >= threshold:
85
+ # Calculate Levenshtein distance between strings
86
+ distance = levenshtein_distance(row1[column_name], row2[column_name])
87
+ max_length = max(len(row1[column_name]), len(row2[column_name]))
88
+ similarity_score = 1 - (distance / max_length)
89
+ if similarity_score >= threshold:
90
+ if similarity == 1: # Exact match
91
+ exact_matches.append((i, j, row1[column_name], row2[column_name]))
92
+ else:
93
  similar_texts.append((i, j, row1[column_name], row2[column_name]))
94
 
95
+ return similar_texts, exact_matches
96
 
97
 
98
  def main():
 
130
  st.write(exact_matches)
131
 
132
 
133
+ # Display exact matches
134
+ st.header("Exact Matches Compare")
135
+ for match in exact_matches:
136
+ st.write(f"Row {match[0]} in warehouse item stocks is exactly the same as Row {match[1]} in industry item stocks:")
137
+ st.write(f"Warehouse: {match[2]}")
138
+ st.write(f"Industry: {match[3]}")
139
+ st.write()
140
+
141
+ # Display similar texts
142
  st.header("Similar (but Not Same) Texts")
143
  for text_pair in similar_texts:
144
  st.write(f"Row {text_pair[0]} in warehouse item stocks is similar to Row {text_pair[1]} in industry item stocks:")
145
  st.write(f"Warehouse: {text_pair[2]}")
146
  st.write(f"Industry: {text_pair[3]}")
147
+ st.write()
 
148
 
149
 
150
  if __name__ == "__main__":