Spaces:

GIZ
/

Development-Project-Synergy-Finder

Running on CPU Upgrade

App Files Files Community

Jan Mühlnikel committed on Mar 18

Commit

f123b98

•

1 Parent(s): fd7cbe7

added matching functionality and viz

Browse files

Files changed (7) hide show

__pycache__/similarity_page.cpython-310.pyc +0 -0
functions/__pycache__/calc_matches.cpython-310.pyc +0 -0
functions/__pycache__/filter_projects.cpython-310.pyc +0 -0
functions/calc_matches.py +31 -0
modules/__pycache__/result_table.cpython-310.pyc +0 -0
modules/result_table.py +106 -51
similarity_page.py +8 -41

__pycache__/similarity_page.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/similarity_page.cpython-310.pyc and b/__pycache__/similarity_page.cpython-310.pyc differ

functions/__pycache__/calc_matches.cpython-310.pyc ADDED Viewed

Binary file (810 Bytes). View file

functions/__pycache__/filter_projects.cpython-310.pyc ADDED Viewed

Binary file (983 Bytes). View file

functions/calc_matches.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import pandas as pd
+import numpy as np
def calc_matches(filtered_df, project_df, similarity_matrix, top_n=60):
    """Find the most similar (project1, project2) pairs for a filtered project set.

    Rows of the match matrix are the filtered projects (project1); columns are
    all projects (project2), so the matching project2 can be any project.

    Args:
        filtered_df: Projects that passed the user's filters. Its index labels
            are used as row positions into ``similarity_matrix`` (assumes the
            index is the positional index of ``project_df`` -- TODO confirm
            against ``filter_projects``).
        project_df: All projects; row ``i`` is assumed to correspond to column
            ``i`` of ``similarity_matrix``.
        similarity_matrix: Dense 2-D array of pairwise similarities. It is
            NOT modified by this function.
        top_n: Maximum number of pairs to return (default 60).

    Returns:
        Tuple ``(p1_df, p2_df)`` of equally long DataFrames. Row ``k`` of
        ``p1_df`` and row ``k`` of ``p2_df`` form one match, and both carry the
        pair's score in a ``"similarity"`` column. Pairs are ordered by
        ascending similarity (the best match is the last row).
    """
    row_positions = np.asarray(filtered_df.index, dtype=np.intp)

    # Integer-array indexing returns a copy, so the caller's matrix is never
    # mutated (the previous np.fill_diagonal call zeroed it in place).
    match_matrix = np.asarray(similarity_matrix)[row_positions]

    # A project must not match itself: zero the entries that lay on the
    # diagonal of the full matrix.
    on_diagonal = row_positions < match_matrix.shape[1]
    match_matrix[np.flatnonzero(on_diagonal), row_positions[on_diagonal]] = 0

    # Clamp the number of pairs; note that a plain [-0:] slice would return
    # every element instead of none.
    pair_count = max(0, min(int(top_n), match_matrix.size))
    if pair_count == 0:
        row_idx = col_idx = np.empty(0, dtype=np.intp)
    else:
        # Flat positions of the highest similarities, ascending.
        flat_top = np.argsort(match_matrix, axis=None)[-pair_count:]
        # Translate back to (project1 row, project2 column).
        row_idx, col_idx = np.unravel_index(flat_top, match_matrix.shape)

    top_values = match_matrix[row_idx, col_idx]

    # Explicit copies so adding a column does not write into a slice of the
    # caller's frames (avoids pandas' chained-assignment warning).
    p1_df = filtered_df.iloc[row_idx].copy()
    p1_df["similarity"] = top_values

    p2_df = project_df.iloc[col_idx].copy()
    p2_df["similarity"] = top_values

    return p1_df, p2_df

modules/__pycache__/result_table.cpython-310.pyc CHANGED Viewed

Binary files a/modules/__pycache__/result_table.cpython-310.pyc and b/modules/__pycache__/result_table.cpython-310.pyc differ

modules/result_table.py CHANGED Viewed

@@ -1,53 +1,108 @@
 import streamlit as st
-def show_table(data_df, similarities:list):
-    st.write("------------------")
-    st.dataframe(
-        data_df[["title_main", "orga_abbreviation", "client", "description_main", "country", "sgd_pred_code", "crs_3_code", "crs_5_code", "similarity"]],
-        use_container_width = True,
-        height = 35 + 35 * len(data_df),
-        column_config={
-            "orga_abbreviation": st.column_config.TextColumn(
-                "Organization",
-                help="If description not in English, description in other language provided",
-                disabled=True
-            ),
-            "client": st.column_config.TextColumn(
-                "Client",
-                help="Client organization of customer",
-                disabled=True
-            ),
-            "title_main": st.column_config.TextColumn(
-                "Title",
-                help="If title not in English, title in other language provided",
-                disabled=True
-            ),
-            "description_main": st.column_config.TextColumn(
-                "Description",
-                help="If description not in English, description in other language provided",
-                disabled=True
-            ),
-            "country": st.column_config.TextColumn(
-                "Country",
-                help="Country of project",
-                disabled=True
-            ),
-            "sgd_pred_code": st.column_config.TextColumn(
-                "SDG Prediction",
-                help="Prediction of SDG's",
-                disabled=True
-            ),
-            "crs_3_code": st.column_config.TextColumn(
-                "CRS 3",
-                help="CRS 3 code given by organization",
-                disabled=True
-            ),
-            "crs_5_code": st.column_config.TextColumn(
-                "CRS 5",
-                help="CRS 5 code given by organization",
-                disabled=True
-            ),
-        },
-        hide_index=True,
-    )

 import streamlit as st
# Columns shown in each result table, in display order.
# NOTE: "sgd_pred_code" is spelled as it appears in the data; do not "fix" it here.
_DISPLAY_COLUMNS = [
    "title_main",
    "orga_abbreviation",
    "client",
    "description_main",
    "country",
    "sgd_pred_code",
    "crs_3_code",
    "crs_5_code",
    "similarity",
]

# Column key -> (header label, tooltip text).
_COLUMN_LABELS = {
    # The tooltip used to be a copy of the description tooltip.
    "orga_abbreviation": ("Organization", "Abbreviation of the organization"),
    "client": ("Client", "Client organization of customer"),
    "title_main": ("Title", "If title not in English, title in other language provided"),
    "description_main": (
        "Description",
        "If description not in English, description in other language provided",
    ),
    "country": ("Country", "Country of project"),
    "sgd_pred_code": ("SDG Prediction", "Prediction of SDG's"),
    "crs_3_code": ("CRS 3", "CRS 3 code given by organization"),
    "crs_5_code": ("CRS 5", "CRS 5 code given by organization"),
}


def _render_project_table(projects_df):
    """Render one read-only project table into the current Streamlit container."""
    st.write("------------------")
    st.dataframe(
        projects_df[_DISPLAY_COLUMNS],
        use_container_width=True,
        # One 35px row per project plus 35px for the header, so the table
        # is shown in full without an inner scrollbar.
        height=35 + 35 * len(projects_df),
        column_config={
            column: st.column_config.TextColumn(label, help=help_text, disabled=True)
            for column, (label, help_text) in _COLUMN_LABELS.items()
        },
        hide_index=True,
    )


def show_table(p1_df, p2_df):
    """Show matched projects side by side in two equally wide columns.

    Args:
        p1_df: Left-hand table (project1 of each match). Must contain every
            column listed in ``_DISPLAY_COLUMNS``.
        p2_df: Right-hand table (project2 of each match), same columns.

    Raises:
        KeyError: If either frame lacks one of the displayed columns.
    """
    col1, col2 = st.columns([1, 1])
    with col1:
        _render_project_table(p1_df)
    with col2:
        _render_project_table(p2_df)

similarity_page.py CHANGED Viewed

@@ -11,9 +11,10 @@ from scipy.sparse import load_npz
 import pickle
 import faiss
 from sentence_transformers import SentenceTransformer
-import modules.result_table as result_table
 import modules.semantic_search as semantic_search
 from functions.filter_projects import filter_projects
 import psutil
 import os
@@ -131,45 +132,11 @@ def show_page():
     # CRS CODE LIST
     crs3_list = [i[-3:] for i in crs3_option]
-    st.write(crs3_list)
-    result_df = filter_projects(projects_df, crs3_list)
-    st.dataframe(result_df)
-    """
-    #semantic_search.show_search(model, faiss_index, sentences)
-    df_subset = projects_df.head(10)
-    selected_index = st.selectbox('Select an entry', df_subset.index, format_func=lambda x: df_subset.loc[x, 'iati_id'])
-    st.write(selected_index)
-    # add index and similarity together
-    indecies = range(0, len(sim_matrix))
-    similarities = sim_matrix[selected_index]
-    zipped_sims = list(zip(indecies, similarities))
-    # remove all 0 similarities
-    filtered_sims = [(index, similarity) for index, similarity in zipped_sims if similarity != 0]
-    # Select and sort top 20 most similar projects
-    sorted_sims = sorted(filtered_sims, key=lambda x: x[1], reverse=True)
-    top_20_sims = sorted_sims[:20]
-    # create result data frame
-    index_list = [tup[0] for tup in top_20_sims]
-    print(index_list)
-    result_df = projects_df.iloc[index_list]
-    print(len(result_df))
-    print(len(result_df))
-    # add other colums to result df
-    similarity_list = [tup[1] for tup in top_20_sims]
-    result_df["similarity"] = similarity_list
-    similarity_table.show_table(result_df, similarity_list)
-    """

 import pickle
 import faiss
 from sentence_transformers import SentenceTransformer
+from modules.result_table import show_table
 import modules.semantic_search as semantic_search
 from functions.filter_projects import filter_projects
+from functions.calc_matches import calc_matches
 import psutil
 import os
     # CRS CODE LIST
     crs3_list = [i[-3:] for i in crs3_option]
+    # FILTER DF WITH SELECTED FILTER OPTIONS
+    filtered_df = filter_projects(projects_df, crs3_list)
+    # FIND MATCHES
+    p1_df, p2_df = calc_matches(filtered_df, projects_df, sim_matrix)
+    # SHOW THE RESULT
+    show_table(p1_df, p2_df)