"""
Page for similarities
"""
################
# DEPENDENCIES #
################
import streamlit as st
import pandas as pd
from scipy.sparse import load_npz
import pickle
import faiss
from sentence_transformers import SentenceTransformer
import modules.result_table as result_table
import modules.semantic_search as semantic_search
from functions.filter_projects import filter_projects
import psutil
import os
def get_process_memory():
    # Return this process's resident memory (RSS) in MB
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / (1024 * 1024)
# CACHE DATA
# Load Similarity matrix
@st.cache_data
def load_sim_matrix():
    loaded_matrix = load_npz("src/similarities.npz")
    dense_matrix = loaded_matrix.toarray()
    return dense_matrix
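# Note: .toarray() materialises the full n x n matrix in RAM. A minimal
# alternative sketch (hypothetical helpers, not used elsewhere) that keeps
# the matrix sparse and densifies a single row on demand:
@st.cache_data
def load_sparse_sim_matrix():
    return load_npz("src/similarities.npz")

def get_similarity_row(idx):
    # getrow() returns a 1 x n sparse row; toarray()[0] densifies only that row
    return load_sparse_sim_matrix().getrow(idx).toarray()[0]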
# Load Projects DFs
@st.cache_data
def load_projects():
    orgas_df = pd.read_csv("src/projects/project_orgas.csv")
    region_df = pd.read_csv("src/projects/project_region.csv")
    sector_df = pd.read_csv("src/projects/project_sector.csv")
    status_df = pd.read_csv("src/projects/project_status.csv")
    texts_df = pd.read_csv("src/projects/project_texts.csv")

    projects_df = pd.merge(orgas_df, region_df, on='iati_id', how='inner')
    projects_df = pd.merge(projects_df, sector_df, on='iati_id', how='inner')
    projects_df = pd.merge(projects_df, status_df, on='iati_id', how='inner')
    projects_df = pd.merge(projects_df, texts_df, on='iati_id', how='inner')
    return projects_df
# Load CRS 3 data
@st.cache_data
def getCRS3():
    # Read in CRS3 CODELISTS
    crs3_df = pd.read_csv('src/codelists/crs3_codes.csv')
    CRS3_CODES = crs3_df['code'].tolist()
    CRS3_NAME = crs3_df['name'].tolist()
    CRS3_MERGED = {f"{name} - {code}": code for name, code in zip(CRS3_NAME, CRS3_CODES)}
    return CRS3_MERGED
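# Hypothetical alternative to slicing the last three characters off a label
# (as show_page does below): recover the code by dictionary lookup instead,
# which also works if a code ever has more or fewer than three digits.
def crs3_codes_from_labels(labels):
    return [str(CRS3_MERGED[label]) for label in labels]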
# Load CRS 5 data
@st.cache_data
def getCRS5():
    # Read in CRS5 CODELISTS
    crs5_df = pd.read_csv('src/codelists/crs5_codes.csv')
    CRS5_CODES = crs5_df['code'].tolist()
    CRS5_NAME = crs5_df['name'].tolist()
    # Note: maps code -> ["name - code"], the opposite direction to CRS3_MERGED
    CRS5_MERGED = {code: [f"{name} - {code}"] for name, code in zip(CRS5_NAME, CRS5_CODES)}
    return CRS5_MERGED
# Load SDG data
@st.cache_data
def getSDG():
    # Read in SDG CODELISTS
    sdg_df = pd.read_csv('src/codelists/sdg_goals.csv')
    SDG_NAMES = sdg_df['name'].tolist()
    return SDG_NAMES
# Load Sentence Transformer Model
@st.cache_resource
def load_model():
    model = SentenceTransformer('all-MiniLM-L6-v2')
    return model
# Load Embeddings
@st.cache_data
def load_embeddings_and_index():
    # Load precomputed embeddings
    with open("src/embeddings.pkl", "rb") as fIn:
        stored_data = pickle.load(fIn)
        sentences = stored_data["sentences"]
        embeddings = stored_data["embeddings"]

    # Build FAISS index over the embeddings
    dimension = embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(embeddings)
    return sentences, embeddings, faiss_index
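# A minimal sketch of how the index could be queried (hypothetical helper;
# the actual search UI lives in modules.semantic_search). IndexFlatL2
# returns squared L2 distances, so smaller means more similar.
def query_index(query, k=5):
    query_vec = model.encode([query])  # shape (1, dimension), float32
    distances, ids = faiss_index.search(query_vec, k)
    return [(sentences[i], float(d)) for i, d in zip(ids[0], distances[0])]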
# USE CACHE FUNCTIONS
sim_matrix = load_sim_matrix()
projects_df = load_projects()
CRS3_MERGED = getCRS3()
CRS5_MERGED = getCRS5()
SDG_NAMES = getSDG()
model = load_model()
sentences, embeddings, faiss_index = load_embeddings_and_index()
def show_page():
    st.write(f"Current RAM usage of this app: {get_process_memory():.2f} MB")
    st.write("Similarities")

    col1, col2 = st.columns([1, 1])
    with col1:
        # CRS 3 SELECTION
        crs3_option = st.multiselect(
            'CRS 3',
            CRS3_MERGED,
            placeholder="Select"
        )
    with col2:
        st.write("x")  # placeholder for a second filter

    # CRS CODE LIST (each label ends in its 3-digit code: "name - code")
    crs3_list = [i[-3:] for i in crs3_option]
    st.write(crs3_list)  # show the selected codes

    result_df = filter_projects(projects_df, crs3_list)
    st.dataframe(result_df)
"""
#semantic_search.show_search(model, faiss_index, sentences)
df_subset = projects_df.head(10)
selected_index = st.selectbox('Select an entry', df_subset.index, format_func=lambda x: df_subset.loc[x, 'iati_id'])
st.write(selected_index)
# add index and similarity together
indecies = range(0, len(sim_matrix))
similarities = sim_matrix[selected_index]
zipped_sims = list(zip(indecies, similarities))
# remove all 0 similarities
filtered_sims = [(index, similarity) for index, similarity in zipped_sims if similarity != 0]
# Select and sort top 20 most similar projects
sorted_sims = sorted(filtered_sims, key=lambda x: x[1], reverse=True)
top_20_sims = sorted_sims[:20]
# create result data frame
index_list = [tup[0] for tup in top_20_sims]
print(index_list)
result_df = projects_df.iloc[index_list]
print(len(result_df))
print(len(result_df))
# add other colums to result df
similarity_list = [tup[1] for tup in top_20_sims]
result_df["similarity"] = similarity_list
similarity_table.show_table(result_df, similarity_list)
"""