Mattral committed on
Commit
e2e62c4
·
verified ·
1 Parent(s): 3c68af2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -38
app.py CHANGED
@@ -63,36 +63,6 @@ def find_exact_matches(df1, df2, column_name):
63
 
64
 
65
 
66
- def find_similar_texts2(df1, df2, column_name, exact_matches, threshold=0.3):
67
- # Find rows with similar texts in the specified column, excluding exact matches
68
- similar_texts = []
69
- exact_match_indices = set(exact_matches.index.tolist())
70
-
71
- # Concatenate texts from both dataframes
72
- all_texts = df1[column_name].astype(str).tolist() + df2[column_name].astype(str).tolist()
73
-
74
- # Compute TF-IDF vectors
75
- vectorizer = TfidfVectorizer()
76
- tfidf_matrix = vectorizer.fit_transform(all_texts)
77
-
78
- # Compute cosine similarity matrix
79
- similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
80
-
81
- # Iterate over pairs of rows to find similar texts
82
- for i, row1 in df1.iterrows():
83
- for j, row2 in df2.iterrows():
84
- if i not in exact_match_indices and j not in exact_match_indices:
85
- similarity = similarity_matrix[i, len(df1) + j]
86
- if similarity =1: # EXact matches
87
- # Calculate Levenshtein distance between strings
88
- distance = levenshtein_distance(row1[column_name], row2[column_name])
89
- max_length = max(len(row1[column_name]), len(row2[column_name]))
90
- similarity_score = 1 - (distance / max_length)
91
- if similarity_score >= threshold:
92
- similar_texts.append((i, j, row1[column_name], row2[column_name]))
93
-
94
- return similar_texts2
95
-
96
 
97
  def find_similar_texts(df1, df2, column_name, exact_matches, threshold=0.3):
98
  # Find rows with similar texts in the specified column, excluding exact matches
@@ -142,7 +112,6 @@ def main():
142
  warehouse_columns = warehouse_df.columns.tolist()
143
  industry_columns = industry_df.columns.tolist()
144
 
145
-
146
  # Select columns using dropdowns
147
  st.header("Select Columns")
148
  warehouse_column = st.selectbox("Choose column from warehouse item stocks:", warehouse_columns)
@@ -155,7 +124,6 @@ def main():
155
 
156
  # Find similar texts
157
  similar_texts = find_similar_texts(warehouse_df, industry_df, warehouse_column, exact_matches)
158
- similar_texts2 = find_similar_texts(warehouse_df, industry_df, warehouse_column, exact_matches)
159
 
160
  # Display results
161
  st.header("Exact Matches")
@@ -169,12 +137,6 @@ def main():
169
  st.write(f"Industry: {text_pair[3]}")
170
  st.write
171
 
172
- st.header("Exactly Same Texts")
173
- for text_pair in similar_texts2:
174
- st.write(f"Row {text_pair[0]} in warehouse item stocks is the same as Row {text_pair[1]} in industry item stocks:")
175
- st.write(f"Warehouse: {text_pair[2]}")
176
- st.write(f"Industry: {text_pair[3]}")
177
- st.write
178
 
179
 
180
  if __name__ == "__main__":
 
63
 
64
 
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  def find_similar_texts(df1, df2, column_name, exact_matches, threshold=0.3):
68
  # Find rows with similar texts in the specified column, excluding exact matches
 
112
  warehouse_columns = warehouse_df.columns.tolist()
113
  industry_columns = industry_df.columns.tolist()
114
 
 
115
  # Select columns using dropdowns
116
  st.header("Select Columns")
117
  warehouse_column = st.selectbox("Choose column from warehouse item stocks:", warehouse_columns)
 
124
 
125
  # Find similar texts
126
  similar_texts = find_similar_texts(warehouse_df, industry_df, warehouse_column, exact_matches)
 
127
 
128
  # Display results
129
  st.header("Exact Matches")
 
137
  st.write(f"Industry: {text_pair[3]}")
138
  st.write
139
 
 
 
 
 
 
 
140
 
141
 
142
  if __name__ == "__main__":