from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import streamlit as st
import streamlit.components.v1 as components
import torch
from sentence_transformers import SentenceTransformer, util

# Configure the page and render the main heading.
st.set_page_config(page_title="App", layout="wide")
st.title("Semantic Search on HAL UNIV-COTEDAZUR Collection (articles)")

# Load the article metadata. Missing values become empty strings and every
# cell is cast to ``str`` so fields can be concatenated directly later on.
with st.spinner('Loading dataset...'):
    df = (
        pd.read_csv("data/hal_articles.csv", sep=",", encoding="utf-8")
        .replace(np.nan, '')
        .astype(str)
    )

# Streamlit re-executes the whole script on every interaction, so heavyweight
# objects are kept in Streamlit's resource cache when the running version
# provides ``st.cache_resource``. Otherwise a pass-through decorator is used
# and the helpers simply reload on each call, as the original code did.
_cache_resource = getattr(st, "cache_resource", lambda func: func)


@_cache_resource
def _load_embedder(model_name):
    """Return the SentenceTransformer model identified by *model_name*."""
    return SentenceTransformer(model_name)


@_cache_resource
def _load_corpus_embeddings(model_name):
    """Load the precomputed corpus embeddings for *model_name* onto the CPU."""
    return torch.load(
        f"embeddings/embeddings_{model_name}.pt",
        map_location=torch.device('cpu'),
    )


def llm_response(query, model_option, top_k=5):
    """Return the articles whose embeddings are closest to *query*.

    Args:
        query: Free-text question to embed and search for.
        model_option: Name of the sentence-transformers model; also selects
            the embeddings file ``embeddings/embeddings_<model_option>.pt``.
        top_k: Maximum number of hits to return (defaults to 5).

    Returns:
        A list of dicts, in the order returned by the semantic search, each
        with the keys ``title``, ``date``, ``journal``, ``pub`` and
        ``abstract`` taken from the module-level ``df``.
    """
    embedder = _load_embedder(model_option)
    question_embedding = embedder.encode(query, convert_to_tensor=True)
    corpus_embeddings = _load_corpus_embeddings(model_option)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)

    # ``corpus_id`` is used as a row position in ``df``.
    # NOTE(review): assumes the embeddings file was built in the same row
    # order as data/hal_articles.csv -- confirm against the generation script.
    rows = (df.iloc[hit['corpus_id']] for hit in hits[0])
    return [
        {
            "title": row["title_s"] + ". " + row["subTitle_s"],
            "date": row["producedDate_s"],
            "journal": row["journalTitle_s"],
            "pub": row["journalPublisher_s"],
            "abstract": row["abstract_s"],
        }
        for row in rows
    ]

# Embedding models offered in the sidebar; the selected name is passed to
# llm_response() together with the user's query.
models = [
    'all-MiniLM-L6-v2',
    'all-mpnet-base-v2',
]
model_option = st.sidebar.selectbox(
    "Choose the open embeddings model to use ?",
    models,
)

# Main panel: read the question, show the HAL search engine's own results for
# comparison, then list the semantic-search hits for the selected model.
with st.container():
    if query := st.text_input("Enter your question :"):
        st.markdown(f"### :green[{model_option} results]")

        with st.expander(":blue[click here to see the HAL search engine results]"):
            # URL-encode the user's text so spaces, '&', '#', etc. cannot
            # break or alter the query string sent to HAL.
            hal_url = (
                "https://hal.univ-cotedazur.fr/search/index/"
                f"?q={quote_plus(query)}&rows=30&docType_s=ART"
            )
            components.iframe(hal_url, height=800, scrolling=True)

        with st.spinner('Calculating...'):
            response = llm_response(query, model_option)

        # One success box per hit, in the order returned by llm_response().
        for x in response:
            st.success(
                f"**Title** : {x['title']} \n "
                f"**Date** : {x['date']} \n "
                f"**Journal** : {x['journal']}({x['pub']}) \n "
                f"**Abstract** : {x['abstract']}"
            )