import streamlit as st
import pandas as pd
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search
import torch
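# Embedding model for incoming queries (the precomputed embeddings loaded below were presumably built with the same model)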
model = SentenceTransformer("sentence-transformers/gtr-t5-large")
# Read files: precomputed request embeddings (remote gist) and the FOIA request sample (local)
url = "https://gist.githubusercontent.com/fer-aguirre/b6bdcf59ecae41f84765f72114de9fd1/raw/b4e029fe236c1f38275621686429b2c7aaa3d18b/embeddings.csv"
df_emb = pd.read_csv(url, index_col=0)
df = pd.read_csv('./foia_sample.csv')
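# Wrap the embeddings in a Hugging Face Dataset and convert them to a float tensor for semantic_search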
dataset = Dataset.from_pandas(df_emb)
dataset_embeddings = torch.from_numpy(dataset.to_pandas().to_numpy()).to(torch.float)
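# Free-text query: embed the user's request and retrieve the most similar requests in the corpus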
st.markdown("**Inserta una solicitud de información para generar recomendaciones de dependencias**")  # "Enter an information request to generate agency recommendations"
if request := st.text_area("", value=""):
    output = model.encode(request)
    query_embeddings = torch.FloatTensor(output)
    hits = semantic_search(query_embeddings, dataset_embeddings, top_k=3)
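    # Each hit is a dict with 'corpus_id' (row position in the embeddings file) and 'score'; keep the top three matches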
    id1 = hits[0][0]['corpus_id']
    id2 = hits[0][1]['corpus_id']
    id3 = hits[0][2]['corpus_id']
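    # The sample rows appear to store "dependencia/solicitud" strings, so the first token of the split is the agency name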
    rec1 = df.iloc[id1].str.split(pat="/")[0]
    rec2 = df.iloc[id2].str.split(pat="/")[0]
    rec3 = df.iloc[id3].str.split(pat="/")[0]
    list_rec = [rec1, rec2, rec3]
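    # Drop duplicate agencies while preserving ranking order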
    unique_list = []
    for string in list_rec:
        if string not in unique_list:
            unique_list.append(string)
    st.markdown('Recomendaciones:')  # "Recommendations:"
    for rec in unique_list:
        st.markdown(f':green[{rec[0]}]')

st.markdown("""---""")
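# Demo: sample a random request from foia_sample.csv and compare the recommendations with its original agency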
if st.button('Genera un ejemplo random'):  # "Generate a random example"
    test_example = df['combined'].sample(n=1)
    index = test_example.index
    idx = index[0]
    original = df.iloc[idx].str.split(pat="/")[0]
    request = test_example.to_string(index=False)
    st.text(f'{idx}, {request}')
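    # Same encode-and-search pipeline as above, applied to the sampled request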
    output = model.encode(request)
    query_embeddings = torch.FloatTensor(output)
    hits = semantic_search(query_embeddings, dataset_embeddings, top_k=3)
    id1 = hits[0][0]['corpus_id']
    id2 = hits[0][1]['corpus_id']
    id3 = hits[0][2]['corpus_id']
    rec1 = df.iloc[id1].str.split(pat="/")[0]
    rec2 = df.iloc[id2].str.split(pat="/")[0]
    rec3 = df.iloc[id3].str.split(pat="/")[0]
    list_rec = [rec1, rec2, rec3]
    unique_list = []
    for string in list_rec:
        if string not in unique_list:
            unique_list.append(string)
    st.markdown('Recomendaciones:')
    for rec in unique_list:
        st.markdown(f':green[{rec[0]}]')
    st.markdown('Dependencia original:')  # "Original agency:"
    st.markdown(f':red[{original[0]}]')