Spaces:

GIZ
/

Development-Project-Synergy-Finder

Running on CPU Upgrade

App Files Files Community

Jan Mühlnikel committed on May 26

Commit

7d8805d

•

1 Parent(s): 09c16ce

enhanced documentation

Browse files

Files changed (2) hide show

functions/{calc_matches.py → multi_project_matching.py} +32 -12
similarity_page.py +4 -19

functions/{calc_matches.py → multi_project_matching.py} RENAMED Viewed

@@ -1,24 +1,35 @@
-import pandas as pd
 import numpy as np
-from scipy.sparse import csr_matrix, coo_matrix
-import streamlit as st
-# multi_project_matching
-def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
-    # Ensure the matrix is in a suitable format for manipulation
     if not isinstance(similarity_matrix, csr_matrix):
         similarity_matrix = csr_matrix(similarity_matrix)
     filtered_indices = filtered_df.index.to_list()
     project_indices = project_df.index.to_list()
     match_matrix = similarity_matrix[project_indices, :][:, filtered_indices] # row / column
     dense_match_matrix = match_matrix.toarray()
     flat_matrix = dense_match_matrix.flatten()
-    # Get the indices of the top 15 values in the flattened matrix
     top_15_indices = np.argsort(flat_matrix)[-top_x:]
     # Convert flat indices back to 2D indices
@@ -28,7 +39,6 @@ def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
     top_15_values = flat_matrix[top_15_indices]
     # Prepare the result with row and column indices from original dataframes
-    top_15_matches = []
     org_rows = []
     org_cols = []
     for value, row, col in zip(top_15_values, top_15_2d_indices[0], top_15_2d_indices[1]):
@@ -36,14 +46,24 @@ def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
         original_col_index = filtered_indices[col]
         org_rows.append(original_row_index)
         org_cols.append(original_col_index)
-        top_15_matches.append((value, original_row_index, original_col_index))
     p1_df = filtered_df.loc[org_cols].copy()
     p1_df['similarity'] = top_15_values
     p2_df = project_df.loc[org_rows].copy()
     p2_df['similarity'] = top_15_values
-    print("finished calc matches")
     return p1_df, p2_df

 import numpy as np
+from scipy.sparse import csr_matrix
+"""
+Function to calculate the multi project matching results
+The Multi-Project Matching Feature uncovers synergy opportunities among various development banks and organizations by facilitating the search for similar projects
+within a selected filter setting (filtered_df) and all projects (project_df).
+"""
+def calc_multi_matches(filtered_df, project_df, similarity_matrix, top_x):
+    """
+    filtered_df: df with applied filters
+    project_df: df with all projects
+    similarity_matrix: np sparse matrix with all similarities between projects
+    top_x: number of highest-similarity project pairs to return
+    """
+    # convert npz sparse matrix into csr matrix
     if not isinstance(similarity_matrix, csr_matrix):
         similarity_matrix = csr_matrix(similarity_matrix)
+    # extract indices of the projects
     filtered_indices = filtered_df.index.to_list()
     project_indices = project_df.index.to_list()
+    # size down the matrix to only projects within the filter and convert to dense matrix and flatten it
     match_matrix = similarity_matrix[project_indices, :][:, filtered_indices] # row / column
     dense_match_matrix = match_matrix.toarray()
     flat_matrix = dense_match_matrix.flatten()
+    # get the indices of the top_x highest values in the flattened matrix
     top_15_indices = np.argsort(flat_matrix)[-top_x:]
     # Convert flat indices back to 2D indices
     top_15_values = flat_matrix[top_15_indices]
     # Prepare the result with row and column indices from original dataframes
     org_rows = []
     org_cols = []
     for value, row, col in zip(top_15_values, top_15_2d_indices[0], top_15_2d_indices[1]):
         original_col_index = filtered_indices[col]
         org_rows.append(original_row_index)
         org_cols.append(original_col_index)
+    # create two result dataframes
+    """
+    p1_df: first results of match
+    p2_df: matching result
+    matches are displayed through the indices of p1 and p2 dfs
+    match1 p1_df.iloc[0] & p2_df.iloc[0]
+    match2 p1_df.iloc[1] & p2_df.iloc[1]
+    """
     p1_df = filtered_df.loc[org_cols].copy()
     p1_df['similarity'] = top_15_values
     p2_df = project_df.loc[org_rows].copy()
     p2_df['similarity'] = top_15_values
+    # return both result dfs with matching projects
     return p1_df, p2_df

similarity_page.py CHANGED Viewed

@@ -14,7 +14,7 @@ from modules.multimatch_result_table import show_multi_table
 from modules.singlematch_result_table import show_single_table
 from functions.filter_projects import filter_projects
 from functions.filter_single import filter_single
-from functions.calc_matches import calc_matches
 from functions.same_country_filter import same_country_filter
 from functions.single_similar import find_similar
 #import psutil
@@ -30,29 +30,14 @@ def get_process_memory():
 # Catch DATA
 # Load Similarity matrix
-"""
-@st.cache_data
-def load_sim_matrix():
-    loaded_matrix = load_npz("src/extended_similarities.npz")
-    dense_matrix = loaded_matrix.toarray().astype('float16')
-    return dense_matrix
-"""
 @st.cache_data
 def load_sim_matrix():
     loaded_matrix = load_npz("src/extended_similarities.npz")
     #dense_matrix = loaded_matrix.toarray().astype('float16')
     return loaded_matrix
-# Load Non Similar Orga Matrix
-"""
-@st.cache_data
-def load_nonsameorga_sim_matrix():
-    loaded_matrix = load_npz("src/extended_similarities_nonsimorga.npz")
-    dense_matrix = loaded_matrix.toarray().astype('float16')
-    return dense_matrix
-"""
 def load_nonsameorga_sim_matrix():
     loaded_matrix = load_npz("src/extended_similarities_nonsimorga.npz")
     #dense_matrix = loaded_matrix.toarray().astype('float16')
@@ -272,10 +257,10 @@ def show_multi_matching_page():
         ## if show only different orgas checkbox is activated
         if different_orga_checkbox:
             with st.spinner('Please wait...'):
-                p1_df, p2_df = calc_matches(filtered_df, compare_df, nonsameorgas_sim_matrix, TOP_X_PROJECTS)
         else:
             with st.spinner('Please wait...'):
-                p1_df, p2_df = calc_matches(filtered_df, compare_df, sim_matrix, TOP_X_PROJECTS)
         # SHOW THE RESULT
         show_multi_table(p1_df, p2_df)

 from modules.singlematch_result_table import show_single_table
 from functions.filter_projects import filter_projects
 from functions.filter_single import filter_single
+from functions.multi_project_matching import calc_multi_matches
 from functions.same_country_filter import same_country_filter
 from functions.single_similar import find_similar
 #import psutil
 # Catch DATA
 # Load Similarity matrix
 @st.cache_data
 def load_sim_matrix():
     loaded_matrix = load_npz("src/extended_similarities.npz")
     #dense_matrix = loaded_matrix.toarray().astype('float16')
     return loaded_matrix
+# Load Non Similar Orga Matrix
 def load_nonsameorga_sim_matrix():
     loaded_matrix = load_npz("src/extended_similarities_nonsimorga.npz")
     #dense_matrix = loaded_matrix.toarray().astype('float16')
         ## if show only different orgas checkbox is activated
         if different_orga_checkbox:
             with st.spinner('Please wait...'):
+                p1_df, p2_df = calc_multi_matches(filtered_df, compare_df, nonsameorgas_sim_matrix, TOP_X_PROJECTS)
         else:
             with st.spinner('Please wait...'):
+                p1_df, p2_df = calc_multi_matches(filtered_df, compare_df, sim_matrix, TOP_X_PROJECTS)
         # SHOW THE RESULT
         show_multi_table(p1_df, p2_df)