Mattral committed on
Commit
eca35fa
·
verified ·
1 Parent(s): d5b07e0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -1
app.py CHANGED
@@ -59,7 +59,39 @@ def read_csv_or_excel(file):
59
  def find_exact_matches(df1, df2, column_name):
60
  # Find rows with exact matches in the specified column
61
  matches = pd.merge(df1, df2, on=column_name, how='inner')
62
- return matches
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
 
65
  def find_similar_texts(df1, df2, column_name, exact_matches, threshold=0.3):
@@ -123,6 +155,7 @@ def main():
123
 
124
  # Find similar texts
125
  similar_texts = find_similar_texts(warehouse_df, industry_df, warehouse_column, exact_matches)
 
126
 
127
  # Display results
128
  st.header("Exact Matches")
@@ -136,6 +169,13 @@ def main():
136
  st.write(f"Industry: {text_pair[3]}")
137
  st.write
138
 
 
 
 
 
 
 
 
139
 
140
  if __name__ == "__main__":
141
  main()
 
def find_exact_matches(df1, df2, column_name):
    """Return the rows of ``df1`` and ``df2`` that share a value in ``column_name``.

    The two frames are inner-merged on ``column_name``, so the result holds
    one row per matching pair and carries the columns of both inputs.

    Args:
        df1: First DataFrame; must contain ``column_name``.
        df2: Second DataFrame; must contain ``column_name``.
        column_name: Name of the column to match on.

    Returns:
        The merged DataFrame. It is empty when no value of ``column_name``
        occurs in both inputs.
    """
    # Find rows with exact matches in the specified column
    matches = pd.merge(df1, df2, on=column_name, how='inner')
    # The merged frame must be handed back: a bare ``return`` yields None and
    # breaks every caller that goes on to use the result.
    return matches
63
+
64
+
65
+
66
def find_similar_texts2(df1, df2, column_name, exact_matches, threshold=0.3):
    """Find row pairs whose texts in ``column_name`` are effectively the same.

    Every row of ``df1`` is compared with every row of ``df2``. A pair is
    reported when the TF-IDF cosine similarity of the two texts is 1 and the
    normalised Levenshtein similarity, ``1 - distance / max_length``, is at
    least ``threshold``.

    Args:
        df1: First DataFrame; must contain ``column_name``.
        df2: Second DataFrame; must contain ``column_name``.
        column_name: Name of the text column to compare.
        exact_matches: DataFrame whose index labels are excluded from the
            comparison on both sides.
        threshold: Minimum normalised Levenshtein similarity for a pair to
            be reported.

    Returns:
        A list of ``(index1, index2, value1, value2)`` tuples, where the
        indices are the row labels in ``df1`` and ``df2`` and the values are
        the corresponding cells of ``column_name``. The list is empty when
        either input has no rows.
    """
    import math  # local import so the block does not depend on the file header

    similar_texts = []
    # Nothing to pair up, and the vectoriser cannot be fitted on no text.
    if df1.empty or df2.empty:
        return similar_texts

    # NOTE(review): if ``exact_matches`` is the output of ``pd.merge`` its
    # index is a fresh 0..k-1 range rather than the labels of the source
    # rows - confirm that excluding by these labels is what is intended.
    exact_match_indices = set(exact_matches.index.tolist())

    # Work on string copies so the TF-IDF step and the Levenshtein step see
    # the same text even when the column holds numbers or missing values.
    texts1 = df1[column_name].astype(str).tolist()
    texts2 = df2[column_name].astype(str).tolist()

    # Compute TF-IDF vectors over the texts of both dataframes together
    # (TfidfVectorizer / cosine_similarity are presumably scikit-learn's,
    # imported at the top of the file).
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts1 + texts2)

    # Compute cosine similarity matrix
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Rows of df2 sit after the rows of df1 in the combined matrix.
    offset = len(texts1)

    # The matrix is addressed by position, not by index label, so the lookup
    # stays correct when a frame does not carry a default 0..n-1 index.
    for pos1, (i, row1) in enumerate(df1.iterrows()):
        if i in exact_match_indices:
            continue
        text1 = texts1[pos1]
        for pos2, (j, row2) in enumerate(df2.iterrows()):
            if j in exact_match_indices:
                continue
            similarity = similarity_matrix[pos1, offset + pos2]
            # Keep only pairs whose cosine similarity is 1; compare with a
            # tolerance because the score is a floating-point value.
            if not math.isclose(similarity, 1.0, rel_tol=1e-9):
                continue
            text2 = texts2[pos2]
            # Calculate Levenshtein distance between strings
            distance = levenshtein_distance(text1, text2)
            max_length = max(len(text1), len(text2))
            # Two empty strings are identical; avoid dividing by zero.
            similarity_score = 1 - (distance / max_length) if max_length else 1.0
            if similarity_score >= threshold:
                similar_texts.append((i, j, row1[column_name], row2[column_name]))

    return similar_texts
95
 
96
 
97
  def find_similar_texts(df1, df2, column_name, exact_matches, threshold=0.3):
 
155
 
156
  # Find similar texts
157
  similar_texts = find_similar_texts(warehouse_df, industry_df, warehouse_column, exact_matches)
158
+ similar_texts2 = find_similar_texts(warehouse_df, industry_df, warehouse_column, exact_matches)
159
 
160
  # Display results
161
  st.header("Exact Matches")
 
169
  st.write(f"Industry: {text_pair[3]}")
170
  st.write
171
 
172
+ st.header("Exactly Same Texts")
173
+ for text_pair in similar_texts2:
174
+ st.write(f"Row {text_pair[0]} in warehouse item stocks is the same as Row {text_pair[1]} in industry item stocks:")
175
+ st.write(f"Warehouse: {text_pair[2]}")
176
+ st.write(f"Industry: {text_pair[3]}")
177
+ st.write
178
+
179
 
180
  if __name__ == "__main__":
181
  main()