""" | |
Page for similarities | |
""" | |
################
# DEPENDENCIES #
################
import streamlit as st
import pandas as pd
from scipy.sparse import load_npz
import pickle
import faiss
from sentence_transformers import SentenceTransformer
import modules.result_table as result_table
import modules.semantic_search as semantic_search
from functions.filter_projects import filter_projects
import psutil
import os
def get_process_memory():
    """Return the resident memory (RSS) of this process in MB."""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / (1024 * 1024)
# Cache data
# NOTE: the @st.cache_data / @st.cache_resource decorators below are assumptions
# inferred from the "USE CACHE FUNCTIONS" comment further down; remove them if
# the loaders are meant to run on every rerun.

# Load similarity matrix
@st.cache_data
def load_sim_matrix():
    loaded_matrix = load_npz("src/similarities.npz")
    # toarray() materializes the full dense matrix in RAM
    dense_matrix = loaded_matrix.toarray()
    return dense_matrix
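# A minimal alternative sketch, assuming RAM is tight on the Space: keep the
# matrix sparse and densify single rows on demand instead of converting the
# whole matrix. load_sim_row is illustrative and not used elsewhere on this page.
def load_sim_row(sparse_matrix, row_index):
    # getrow() returns a 1 x n sparse row; toarray() densifies only that row
    return sparse_matrix.getrow(row_index).toarray().ravel()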
# Load projects DataFrames
@st.cache_data
def load_projects():
    orgas_df = pd.read_csv("src/projects/project_orgas.csv")
    region_df = pd.read_csv("src/projects/project_region.csv")
    sector_df = pd.read_csv("src/projects/project_sector.csv")
    status_df = pd.read_csv("src/projects/project_status.csv")
    texts_df = pd.read_csv("src/projects/project_texts.csv")

    # Inner joins keep only projects present in all five files
    projects_df = pd.merge(orgas_df, region_df, on='iati_id', how='inner')
    projects_df = pd.merge(projects_df, sector_df, on='iati_id', how='inner')
    projects_df = pd.merge(projects_df, status_df, on='iati_id', how='inner')
    projects_df = pd.merge(projects_df, texts_df, on='iati_id', how='inner')
    return projects_df
# Load CRS 3 data
@st.cache_data
def getCRS3():
    # Read in CRS3 codelist
    crs3_df = pd.read_csv('src/codelists/crs3_codes.csv')
    CRS3_CODES = crs3_df['code'].tolist()
    CRS3_NAME = crs3_df['name'].tolist()
    # Map display labels to codes, e.g. "Basic Education - 112" -> 112
    CRS3_MERGED = {f"{name} - {code}": code for name, code in zip(CRS3_NAME, CRS3_CODES)}
    return CRS3_MERGED
# Load CRS 5 data
@st.cache_data
def getCRS5():
    # Read in CRS5 codelist
    crs5_df = pd.read_csv('src/codelists/crs5_codes.csv')
    CRS5_CODES = crs5_df['code'].tolist()
    CRS5_NAME = crs5_df['name'].tolist()
    # Note: unlike CRS3_MERGED, this maps code -> [label] (single-element list)
    CRS5_MERGED = {code: [f"{name} - {code}"] for name, code in zip(CRS5_NAME, CRS5_CODES)}
    return CRS5_MERGED
# Load SDG data
@st.cache_data
def getSDG():
    # Read in SDG codelist
    sdg_df = pd.read_csv('src/codelists/sdg_goals.csv')
    SDG_NAMES = sdg_df['name'].tolist()
    return SDG_NAMES
# Load Sentence Transformer model
@st.cache_resource
def load_model():
    model = SentenceTransformer('all-MiniLM-L6-v2')
    return model
# Load embeddings and build the FAISS index
@st.cache_resource
def load_embeddings_and_index():
    # Load pre-computed embeddings
    with open("src/embeddings.pkl", "rb") as fIn:
        stored_data = pickle.load(fIn)
        sentences = stored_data["sentences"]
        embeddings = stored_data["embeddings"]

    # Build an exact L2 FAISS index over the embeddings
    dimension = embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(embeddings)
    return sentences, embeddings, faiss_index
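# A minimal query sketch, assuming semantic_search.show_search wraps something
# like this: encode a free-text query with the same model and look up the k
# nearest stored sentences in the index. query_index is illustrative and is
# not called on this page.
def query_index(query, model, faiss_index, sentences, k=5):
    query_embedding = model.encode([query])            # shape (1, dimension), float32
    distances, ids = faiss_index.search(query_embedding, k)
    # Lower L2 distance means a closer match
    return [(sentences[i], d) for i, d in zip(ids[0], distances[0])]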
# USE CACHE FUNCTIONS
sim_matrix = load_sim_matrix()
projects_df = load_projects()
CRS3_MERGED = getCRS3()
CRS5_MERGED = getCRS5()
SDG_NAMES = getSDG()
model = load_model()
sentences, embeddings, faiss_index = load_embeddings_and_index()
def show_page():
    st.write(f"Current RAM usage of this app: {get_process_memory():.2f} MB")
    st.write("Similarities")

    col1, col2 = st.columns([1, 1])
    with col1:
        # CRS 3 selection
        crs3_option = st.multiselect(
            'CRS 3',
            CRS3_MERGED,
            placeholder="Select"
        )
    with col2:
        st.write("x")  # placeholder for a second filter

    # Extract the numeric codes from the "name - code" labels
    # (assumes every CRS3 code is exactly 3 digits)
    crs3_list = [i[-3:] for i in crs3_option]
    st.write(crs3_list)

    result_df = filter_projects(projects_df, crs3_list)
    st.dataframe(result_df)
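# A minimal sketch, assuming the multiselect labels always come from CRS3_MERGED:
# looking codes up in the dict is more robust than slicing the last 3 characters,
# since it also handles codes that are not exactly 3 digits.
# codes_from_labels is illustrative and not called above.
def codes_from_labels(labels):
    return [str(CRS3_MERGED[label]) for label in labels]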
""" | |
#semantic_search.show_search(model, faiss_index, sentences) | |
df_subset = projects_df.head(10) | |
selected_index = st.selectbox('Select an entry', df_subset.index, format_func=lambda x: df_subset.loc[x, 'iati_id']) | |
st.write(selected_index) | |
# add index and similarity together | |
indecies = range(0, len(sim_matrix)) | |
similarities = sim_matrix[selected_index] | |
zipped_sims = list(zip(indecies, similarities)) | |
# remove all 0 similarities | |
filtered_sims = [(index, similarity) for index, similarity in zipped_sims if similarity != 0] | |
# Select and sort top 20 most similar projects | |
sorted_sims = sorted(filtered_sims, key=lambda x: x[1], reverse=True) | |
top_20_sims = sorted_sims[:20] | |
# create result data frame | |
index_list = [tup[0] for tup in top_20_sims] | |
print(index_list) | |
result_df = projects_df.iloc[index_list] | |
print(len(result_df)) | |
print(len(result_df)) | |
# add other colums to result df | |
similarity_list = [tup[1] for tup in top_20_sims] | |
result_df["similarity"] = similarity_list | |
similarity_table.show_table(result_df, similarity_list) | |
""" | |