Geraldine's picture
Update app.py
ce243f1
raw
history blame
No virus
2.49 kB
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import streamlit as st
import streamlit.components.v1 as components
import torch
from sentence_transformers import SentenceTransformer, util
# Configure the page (tab title, wide layout) and show the app heading.
st.set_page_config(page_title="App", layout="wide")
st.title("Semantic Search on HAL UNIV-COTEDAZUR Collection (articles)")

# Load the article metadata once per script run. Missing values become empty
# strings and every column is cast to str, so the fields can be concatenated
# directly when the results are rendered.
with st.spinner('Loading dataset...'):
    df = (
        pd.read_csv("data/hal_articles.csv", sep=",", encoding="utf-8")
        .replace(np.nan, '')
        .astype(str)
    )
def llm_response(query, model_option):
    """Return metadata for the corpus articles that best match *query*.

    The query is embedded with the SentenceTransformer model named by
    *model_option* and searched against the corpus embeddings stored in
    ``embeddings/embeddings_{model_option}.pt`` (loaded on CPU). Each hit's
    ``corpus_id`` is used as a positional index into the module-level ``df``.

    Args:
        query: Free-text question to search for.
        model_option: Model name; passed to ``SentenceTransformer`` and used
            to pick the matching embeddings file.

    Returns:
        A list of dicts (at most five, in the order produced by
        ``util.semantic_search``) with the keys ``title``, ``date``,
        ``journal``, ``pub`` and ``abstract``.
    """
    model = SentenceTransformer(model_option)
    query_vector = model.encode(query, convert_to_tensor=True)
    corpus_vectors = torch.load(
        f"embeddings/embeddings_{model_option}.pt",
        map_location=torch.device('cpu'),
    )

    # Only one query is submitted, so only the first hit list is used.
    top_hits = util.semantic_search(query_vector, corpus_vectors, top_k=5)[0]

    # NOTE(review): presumably the rows of df are in the same order as the
    # stored embeddings, since corpus_id is used as a row position — confirm.
    rows = (df.iloc[hit['corpus_id']] for hit in top_hits)
    return [
        {
            "title": row["title_s"] + ". " + row["subTitle_s"],
            "date": row["producedDate_s"],
            "journal": row["journalTitle_s"],
            "pub": row["journalPublisher_s"],
            "abstract": row["abstract_s"],
        }
        for row in rows
    ]
# Embedding models the user can choose from; llm_response loads the matching
# precomputed corpus embeddings by model name.
models = ['all-MiniLM-L6-v2', 'all-mpnet-base-v2']
model_option = st.sidebar.selectbox("Choose the open embeddings model to use ?", models)

with st.container():
    # An empty text input is falsy, so nothing below runs until a question
    # has been entered.
    if query := st.text_input("Enter your question :"):
        st.markdown(f"### :green[{model_option} results]")

        # Embed the native HAL search for the same question. The query is
        # free text, so it is percent-encoded: a raw '&', '#', '+' or '%'
        # would otherwise truncate or corrupt the q= parameter.
        hal_search_url = (
            "https://hal.univ-cotedazur.fr/search/index/"
            f"?q={quote_plus(query)}&rows=30&docType_s=ART"
        )
        with st.expander(":blue[click here to see the HAL search engine results]"):
            components.iframe(hal_search_url, height=800, scrolling=True)

        with st.spinner('Calculating...'):
            response = llm_response(query, model_option)

        # One result box per article returned by the semantic search.
        for article in response:
            st.success(
                " \n ".join(
                    [
                        f"**Title** : {article['title']}",
                        f"**Date** : {article['date']}",
                        f"**Journal** : {article['journal']}({article['pub']})",
                        f"**Abstract** : {article['abstract']}",
                    ]
                )
            )