# Jan Mühlnikel
# added single matching result table
# c8e0175
# raw
# history blame
# 9 kB
"""
Page for similarities
"""
################
# DEPENDENCIES #
################
import streamlit as st
import pandas as pd
from scipy.sparse import load_npz
import pickle
from sentence_transformers import SentenceTransformer
from modules.multimatch_result_table import show_multi_table
from modules.singlematch_result_table import show_single_table
from functions.filter_projects import filter_projects
from functions.calc_matches import calc_matches
from functions.same_country_filter import same_country_filter
from functions.single_similar import find_similar
import psutil
import os
import gc
def get_process_memory():
    """Return the resident set size (RSS) of the current process in MiB."""
    bytes_per_mib = 1024 * 1024
    current_process = psutil.Process(os.getpid())
    rss_bytes = current_process.memory_info().rss
    return rss_bytes / bytes_per_mib
# Catch DATA
# Load Similarity matrix
@st.cache_data
def load_sim_matrix():
    """Load the similarity matrix from disk and return it as a dense array.

    The matrix is stored in sparse ``.npz`` form in
    ``src/extended_similarities.npz`` and expanded with ``toarray()``.
    """
    return load_npz("src/extended_similarities.npz").toarray()
# Load Non Similar Orga Matrix
@st.cache_data
def load_nonsameorga_sim_matrix():
    """Load the "non same organization" similarity matrix as a dense array.

    The matrix is stored in sparse ``.npz`` form in
    ``src/extended_similarities_nonsimorga.npz`` and expanded with
    ``toarray()``.
    """
    return load_npz("src/extended_similarities_nonsimorga.npz").toarray()
# Load Projects DFs
@st.cache_data
def load_projects():
    """Read the per-topic project CSVs and join them into one DataFrame.

    The five tables (organizations, region, sector, status, texts) are
    inner-joined on ``iati_id`` in that order.

    Returns:
        A tuple ``(projects_df, iati_search_list, title_search_list)`` where
        the two lists hold, per merged row, the IATI id as a string and the
        label ``"<title_main> (<ORGA_ABBREVIATION>)"``.
    """
    csv_paths = (
        "src/projects/project_orgas.csv",
        "src/projects/project_region.csv",
        "src/projects/project_sector.csv",
        "src/projects/project_status.csv",
        "src/projects/project_texts.csv",
    )
    tables = [pd.read_csv(path) for path in csv_paths]

    # Fold the remaining tables into the first one, left to right.
    merged = tables[0]
    for table in tables[1:]:
        merged = pd.merge(merged, table, on='iati_id', how='inner')

    # Both search lists are positionally aligned with the rows of `merged`.
    id_labels = []
    title_labels = []
    for row in merged.itertuples():
        id_labels.append(f'{row.iati_id}')
        title_labels.append(f'{row.title_main} ({row.orga_abbreviation.upper()})')

    return merged, id_labels, title_labels
# Load CRS 3 data
@st.cache_data
def getCRS3():
    """Build the CRS3 lookup from ``src/codelists/crs3_codes.csv``.

    Returns:
        A dict mapping the label ``"<name> - <code>"`` to its code.
    """
    codelist = pd.read_csv('src/codelists/crs3_codes.csv')
    codes = codelist['code'].tolist()
    names = codelist['name'].tolist()

    label_to_code = {}
    for name, code in zip(names, codes):
        label_to_code[f"{name} - {code}"] = code
    return label_to_code
# Load CRS 5 data
@st.cache_data
def getCRS5():
    """Build the CRS5 lookup from ``src/codelists/crs5_codes.csv``.

    Returns:
        A dict mapping each code to a one-element list holding the label
        ``"<name> - <code>"``.
    """
    codelist = pd.read_csv('src/codelists/crs5_codes.csv')
    codes = codelist['code'].tolist()
    names = codelist['name'].tolist()

    code_to_label = {}
    for name, code in zip(names, codes):
        code_to_label[code] = [f"{name} - {code}"]
    return code_to_label
# Load SDG data
@st.cache_data
def getSDG():
    """Return the ``name`` column of ``src/codelists/sdg_goals.csv`` as a list."""
    return pd.read_csv('src/codelists/sdg_goals.csv')['name'].tolist()
# Load Country Data
@st.cache_data
def getCountry():
    """Build the country option labels from the ISO 3166-1 alpha-2 codelist.

    Returns:
        A list of labels ``"<Country> (<CODE>)"``, one per row of the CSV.
    """
    countries = pd.read_csv('src/codelists/country_codes_ISO3166-1alpha-2.csv')
    codes = countries['Alpha-2 code'].tolist()
    names = countries['Country'].tolist()

    # The slice [-3:-1] keeps the two characters before the last one.
    # NOTE(review): presumably the raw code values end with a closing quote
    # character - confirm against the CSV.
    return [
        f"{name} ({code[-3:-1].upper()})"
        for name, code in zip(names, codes)
    ]
# Load Sentence Transformer Model
@st.cache_resource
def load_model():
    """Create the ``all-MiniLM-L6-v2`` SentenceTransformer used for search."""
    return SentenceTransformer('all-MiniLM-L6-v2')
# Load Embeddings
@st.cache_data
def load_embeddings_and_index():
    """Load the stored embeddings from ``src/embeddings.pkl``.

    Despite the function name, only the ``"embeddings"`` entry of the pickled
    object is returned; no separate index is loaded here.
    """
    # NOTE: unpickling can execute arbitrary code, so this file must come
    # from a trusted source.
    with open("src/embeddings.pkl", "rb") as pickle_file:
        payload = pickle.load(pickle_file)
    return payload["embeddings"]
# USE CACHE FUNCTIONS
# Module-level data shared by the page functions below. Every loader called
# here is wrapped in a Streamlit cache decorator.
sim_matrix = load_sim_matrix()
nonsameorgas_sim_matrix = load_nonsameorga_sim_matrix()
projects_df, iati_search_list, title_search_list = load_projects()
CRS3_MERGED = getCRS3()
CRS5_MERGED = getCRS5()
SDG_NAMES = getSDG()
COUNTRY_OPTION_LIST = getCountry()
# LOAD MODEL FROM CACHE FOR SEMANTIC SEARCH
model = load_model()
embeddings = load_embeddings_and_index()
def show_multi_matching_page():
    """Render the multi-project matching page.

    Draws the filter widgets (CRS 3, CRS 5, SDG, country, organization and a
    free-text query), passes the selections to ``filter_projects`` and, when
    that returns a DataFrame, computes the matches with ``calc_matches`` and
    displays them with ``show_multi_table``. When ``filter_projects`` returns
    anything else, a hint asking for a filter or query is shown instead.
    """
    st.write(f"Current RAM usage of this app: {get_process_memory():.2f} MB")
    st.write("Similarities")

    # The CRS 5 field stays disabled until at least one CRS 3 code is chosen.
    st.session_state.crs5_option_disabled = True

    col1, col2 = st.columns([1, 1])
    with col1:
        # CRS 3 SELECTION
        crs3_option = st.multiselect(
            'CRS 3',
            CRS3_MERGED,
            placeholder="Select CRS3"
        )

        # CRS 5 SELECTION
        ## Only enable crs5 select field when crs3 code is selected
        if crs3_option:
            st.session_state.crs5_option_disabled = False

        ## Offer only CRS 5 labels whose code starts with the three-digit code
        ## at the end of a selected CRS 3 label.
        crs5_list = [
            txt[0].replace('"', "")
            for crs3_item in crs3_option
            for code, txt in CRS5_MERGED.items()
            if str(code)[:3] == str(crs3_item)[-3:]
        ]

        ## crs5 select field
        crs5_option = st.multiselect(
            'CRS 5',
            crs5_list,
            placeholder="Select CRS 5",
            disabled=st.session_state.crs5_option_disabled
        )

        # SDG SELECTION (the last entry of SDG_NAMES is not offered)
        sdg_option = st.selectbox(
            label='SDG',
            index=None,
            placeholder="Select SDG",
            options=SDG_NAMES[:-1],
        )

        different_orga_checkbox = st.checkbox("Only matches between different organizations")
        filterd_country_only_checkbox = st.checkbox("Only matches between filtered countries")

    with col2:
        # COUNTRY SELECTION
        country_option = st.multiselect(
            'Country / Countries',
            COUNTRY_OPTION_LIST,
            placeholder="All"
        )

        # ORGA SELECTION
        orga_list = _orga_option_labels(projects_df)
        orga_option = st.multiselect(
            'Development Bank / Organization',
            orga_list,
            placeholder="All"
        )

        # SEARCH BOX
        query = st.text_input("Enter your search query:")

    # CRS CODE LIST: the codes are the trailing 3 / 5 characters of the labels
    crs3_list = [label[-3:] for label in crs3_option]
    crs5_list = [label[-5:] for label in crs5_option]

    # SDG CODE LIST
    # NOTE(review): only the first character of the selected SDG name is
    # passed on. If the names start with the goal number, two-digit goals
    # collapse to "1" - confirm against the codelist and filter_projects.
    if sdg_option is not None:
        sdg_str = sdg_option[0]
    else:
        sdg_str = ""

    # COUNTRY CODES LIST: the two characters before the closing ")"
    country_code_list = [option[-3:-1] for option in country_option]

    # ORGANIZATION CODES LIST: text inside the LAST pair of parentheses, so a
    # "(" inside the full organization name cannot break the parsing.
    orga_code_list = [option.rsplit("(", 1)[1][:-1].lower() for option in orga_option]

    # FILTER DF WITH SELECTED FILTER OPTIONS
    TOP_X_PROJECTS = 30
    filtered_df = filter_projects(
        projects_df,
        crs3_list,
        crs5_list,
        sdg_str,
        country_code_list,
        orga_code_list,
        query,
        model,
        embeddings,
        TOP_X_PROJECTS,
    )

    if isinstance(filtered_df, pd.DataFrame):
        # FIND MATCHES
        ## If the "same country" checkbox is activated, restrict the
        ## comparison set to the filtered countries.
        if filterd_country_only_checkbox:
            compare_df = same_country_filter(projects_df, country_code_list)
        else:
            compare_df = projects_df

        ## If the "different organizations" checkbox is activated, use the
        ## matrix loaded for that purpose.
        if different_orga_checkbox:
            active_matrix = nonsameorgas_sim_matrix
        else:
            active_matrix = sim_matrix
        p1_df, p2_df = calc_matches(filtered_df, compare_df, active_matrix, TOP_X_PROJECTS)

        # SHOW THE RESULT
        show_multi_table(p1_df, p2_df)

        del p1_df, p2_df
    else:
        st.write("Select at least on CRS 3, SDG or type in a query")

    # Drop the per-run intermediates explicitly and trigger a collection.
    del crs3_list, crs5_list, sdg_str, filtered_df
    gc.collect()


def _orga_option_labels(df):
    """Return one ``"<full name> (<ABBREVIATION>)"`` label per organization.

    Labels are built from the distinct (full name, abbreviation) pairs of
    *df* in order of first appearance, so a name is always paired with the
    abbreviation from its own row.
    """
    pairs = df[["orga_full_name", "orga_abbreviation"]].drop_duplicates()
    return [
        f"{full_name} ({abbreviation.upper()})"
        for full_name, abbreviation in zip(pairs["orga_full_name"], pairs["orga_abbreviation"])
    ]
def show_single_matching_page():
    """Render the single-project matching page.

    Lets the user pick one project, either by IATI id or by title, and shows
    the result of ``find_similar`` for that project with
    ``show_single_table``.
    """
    search_option = st.selectbox(
        label='Select how you want to search',
        index=0,
        placeholder=" ",
        options=["Search with IATI ID", "Search with project title"],
    )

    if search_option == "Search with IATI ID":
        search_list = iati_search_list
    else:
        search_list = title_search_list

    # Select by row position rather than by label. Looking a label up with
    # list.index() returns the first occurrence, which picks the wrong project
    # whenever two rows share the same label (e.g. identical titles).
    selected_index = st.selectbox(
        label='Search for a project with the IATI ID or teh project title',
        index=None,
        placeholder=" ",
        options=range(len(search_list)),
        format_func=lambda position: search_list[position],
    )

    # Compare against None explicitly: position 0 is a valid selection.
    if selected_index is not None:
        top_projects_df = find_similar(selected_index, sim_matrix, projects_df, 10)
        show_single_table(top_projects_df)