Jan Mühlnikel committed on
Commit
f123b98
1 Parent(s): fd7cbe7

added matching functionality and viz

Browse files
__pycache__/similarity_page.cpython-310.pyc CHANGED
Binary files a/__pycache__/similarity_page.cpython-310.pyc and b/__pycache__/similarity_page.cpython-310.pyc differ
 
functions/__pycache__/calc_matches.cpython-310.pyc ADDED
Binary file (810 Bytes). View file
 
functions/__pycache__/filter_projects.cpython-310.pyc ADDED
Binary file (983 Bytes). View file
 
functions/calc_matches.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
def calc_matches(filtered_df, project_df, similarity_matrix, top_n=60):
    """Find the most similar (project1, project2) pairs for the filtered projects.

    Rows of the match matrix are the filtered projects (project1); columns are
    all projects (project2), so the matching partner can be any project.

    Args:
        filtered_df: Projects remaining after filtering. Its index labels are
            used directly as row positions into ``similarity_matrix``, so they
            are assumed to be integer positions of the same projects in
            ``project_df`` (e.g. a filtered view of a frame with a default
            RangeIndex) -- TODO confirm against the caller.
        project_df: All projects; positional row ``i`` is assumed to correspond
            to column ``i`` of ``similarity_matrix``.
        similarity_matrix: Square 2-D NumPy array of pairwise similarities.
            It is NOT modified by this function.
        top_n: Maximum number of pairs to return (default 60, the previous
            hard-coded value).

    Returns:
        ``(p1_df, p2_df)``: two new DataFrames of equal length. Row ``k`` of
        ``p1_df`` is the filtered project and row ``k`` of ``p2_df`` its
        matching project; both carry the pair's score in a ``similarity``
        column. Pairs are ordered from most to least similar.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    row_labels = filtered_df.index.to_numpy()

    # Nothing to match: return empty frames that still carry the column the
    # result table expects.
    if len(row_labels) == 0 or top_n == 0:
        no_rows = np.array([], dtype=np.intp)
        no_scores = np.array([], dtype=float)
        return (
            filtered_df.iloc[no_rows].assign(similarity=no_scores),
            project_df.iloc[no_rows].assign(similarity=no_scores),
        )

    # Integer-array indexing returns a copy, so the caller's matrix stays
    # untouched when the self-matches are masked out below.
    match_matrix = similarity_matrix[row_labels]

    # A project must not be matched with itself: zero the entries that lie on
    # the diagonal of the full matrix.
    match_matrix[np.arange(len(row_labels)), row_labels] = 0

    # Flat positions of the highest similarities, best match first.
    best_first = np.argsort(match_matrix, axis=None)[::-1][:top_n]

    # Row (project1) and column (project2) position of every selected pair.
    row_pos, col_pos = np.unravel_index(best_first, match_matrix.shape)

    # The corresponding similarity values.
    top_values = match_matrix[row_pos, col_pos]

    # ``assign`` builds new frames instead of writing into an ``iloc``
    # selection, which avoids pandas' chained-assignment warning.
    p1_df = filtered_df.iloc[row_pos].assign(similarity=top_values)
    p2_df = project_df.iloc[col_pos].assign(similarity=top_values)

    return p1_df, p2_df
28
+
29
+
30
+
31
+
modules/__pycache__/result_table.cpython-310.pyc CHANGED
Binary files a/modules/__pycache__/result_table.cpython-310.pyc and b/modules/__pycache__/result_table.cpython-310.pyc differ
 
modules/result_table.py CHANGED
@@ -1,53 +1,108 @@
1
  import streamlit as st
2
 
3
- def show_table(data_df, similarities:list):
4
- st.write("------------------")
5
-
6
- st.dataframe(
7
- data_df[["title_main", "orga_abbreviation", "client", "description_main", "country", "sgd_pred_code", "crs_3_code", "crs_5_code", "similarity"]],
8
- use_container_width = True,
9
- height = 35 + 35 * len(data_df),
10
- column_config={
11
- "orga_abbreviation": st.column_config.TextColumn(
12
- "Organization",
13
- help="If description not in English, description in other language provided",
14
- disabled=True
15
- ),
16
- "client": st.column_config.TextColumn(
17
- "Client",
18
- help="Client organization of customer",
19
- disabled=True
20
- ),
21
- "title_main": st.column_config.TextColumn(
22
- "Title",
23
- help="If title not in English, title in other language provided",
24
- disabled=True
25
- ),
26
- "description_main": st.column_config.TextColumn(
27
- "Description",
28
- help="If description not in English, description in other language provided",
29
- disabled=True
30
- ),
31
- "country": st.column_config.TextColumn(
32
- "Country",
33
- help="Country of project",
34
- disabled=True
35
- ),
36
- "sgd_pred_code": st.column_config.TextColumn(
37
- "SDG Prediction",
38
- help="Prediction of SDG's",
39
- disabled=True
40
- ),
41
- "crs_3_code": st.column_config.TextColumn(
42
- "CRS 3",
43
- help="CRS 3 code given by organization",
44
- disabled=True
45
- ),
46
- "crs_5_code": st.column_config.TextColumn(
47
- "CRS 5",
48
- help="CRS 5 code given by organization",
49
- disabled=True
50
- ),
51
- },
52
- hide_index=True,
53
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
 
3
# Columns shown in each result table, in display order.
_RESULT_COLUMNS = [
    "title_main",
    "orga_abbreviation",
    "client",
    "description_main",
    "country",
    "sgd_pred_code",
    "crs_3_code",
    "crs_5_code",
    "similarity",
]

# (column key, header label, tooltip) for every text column of the table.
_TEXT_COLUMNS = (
    ("orga_abbreviation", "Organization", "Abbreviation of the organization"),
    ("client", "Client", "Client organization of customer"),
    ("title_main", "Title", "If title not in English, title in other language provided"),
    (
        "description_main",
        "Description",
        "If description not in English, description in other language provided",
    ),
    ("country", "Country", "Country of project"),
    ("sgd_pred_code", "SDG Prediction", "Prediction of SDG's"),
    ("crs_3_code", "CRS 3", "CRS 3 code given by organization"),
    ("crs_5_code", "CRS 5", "CRS 5 code given by organization"),
)


def _text_column_config():
    """Build the read-only ``column_config`` mapping shared by both tables."""
    return {
        key: st.column_config.TextColumn(label, help=help_text, disabled=True)
        for key, label, help_text in _TEXT_COLUMNS
    }


def _show_project_table(project_df):
    """Render one project table (divider plus dataframe) in the current container."""
    st.write("------------------")

    st.dataframe(
        project_df[_RESULT_COLUMNS],
        use_container_width=True,
        # Grow the widget with the row count; presumably 35 px for the header
        # plus 35 px per row so the table shows without an inner scrollbar.
        height=35 + 35 * len(project_df),
        column_config=_text_column_config(),
        hide_index=True,
    )


def show_table(p1_df, p2_df):
    """Show the matched projects side by side in two equally wide columns.

    Args:
        p1_df: DataFrame rendered in the left column.
        p2_df: DataFrame rendered in the right column.

    Both frames must contain every column listed in ``_RESULT_COLUMNS``;
    a missing column raises ``KeyError`` from the column selection.
    """
    col1, col2 = st.columns([1, 1])

    with col1:
        _show_project_table(p1_df)

    with col2:
        _show_project_table(p2_df)
similarity_page.py CHANGED
@@ -11,9 +11,10 @@ from scipy.sparse import load_npz
11
  import pickle
12
  import faiss
13
  from sentence_transformers import SentenceTransformer
14
- import modules.result_table as result_table
15
  import modules.semantic_search as semantic_search
16
  from functions.filter_projects import filter_projects
 
17
  import psutil
18
  import os
19
 
@@ -131,45 +132,11 @@ def show_page():
131
  # CRS CODE LIST
132
  crs3_list = [i[-3:] for i in crs3_option]
133
 
134
- st.write(crs3_list)
 
135
 
136
- result_df = filter_projects(projects_df, crs3_list)
137
- st.dataframe(result_df)
138
 
139
-
140
-
141
- """
142
- #semantic_search.show_search(model, faiss_index, sentences)
143
-
144
- df_subset = projects_df.head(10)
145
- selected_index = st.selectbox('Select an entry', df_subset.index, format_func=lambda x: df_subset.loc[x, 'iati_id'])
146
-
147
- st.write(selected_index)
148
-
149
- # add index and similarity together
150
- indecies = range(0, len(sim_matrix))
151
- similarities = sim_matrix[selected_index]
152
- zipped_sims = list(zip(indecies, similarities))
153
-
154
- # remove all 0 similarities
155
- filtered_sims = [(index, similarity) for index, similarity in zipped_sims if similarity != 0]
156
-
157
- # Select and sort top 20 most similar projects
158
- sorted_sims = sorted(filtered_sims, key=lambda x: x[1], reverse=True)
159
- top_20_sims = sorted_sims[:20]
160
-
161
- # create result data frame
162
- index_list = [tup[0] for tup in top_20_sims]
163
- print(index_list)
164
- result_df = projects_df.iloc[index_list]
165
- print(len(result_df))
166
-
167
- print(len(result_df))
168
- # add other colums to result df
169
-
170
- similarity_list = [tup[1] for tup in top_20_sims]
171
- result_df["similarity"] = similarity_list
172
-
173
- similarity_table.show_table(result_df, similarity_list)
174
-
175
- """
 
11
  import pickle
12
  import faiss
13
  from sentence_transformers import SentenceTransformer
14
+ from modules.result_table import show_table
15
  import modules.semantic_search as semantic_search
16
  from functions.filter_projects import filter_projects
17
+ from functions.calc_matches import calc_matches
18
  import psutil
19
  import os
20
 
 
132
  # CRS CODE LIST
133
  crs3_list = [i[-3:] for i in crs3_option]
134
 
135
+ # FILTER DF WITH SELECTED FILTER OPTIONS
136
+ filtered_df = filter_projects(projects_df, crs3_list)
137
 
138
+ # FIND MATCHES
139
+ p1_df, p2_df = calc_matches(filtered_df, projects_df, sim_matrix)
140
 
141
+ # SHOW THE RESULT
142
+ show_table(p1_df, p2_df)