"""Streamlit app: semantic search over the HAL UNIV-COTEDAZUR article collection.

The user types a question, picks a sentence-transformers model, and the app
shows the closest articles according to precomputed corpus embeddings, next
to the results of the regular HAL search engine for the same query.
"""

from urllib.parse import urlencode

import numpy as np
import pandas as pd
import streamlit as st
import streamlit.components.v1 as components
import torch
from sentence_transformers import SentenceTransformer, util

# Set Streamlit page configuration (kept ahead of every other Streamlit call).
st.set_page_config(page_title="App", layout="wide")
st.title("Semantic Search on HAL UNIV-COTEDAZUR Collection (articles)")

DATASET_PATH = "data/hal_articles.csv"
HAL_SEARCH_URL = "https://hal.univ-cotedazur.fr/search/index/"
MODELS = ['all-MiniLM-L6-v2', 'all-mpnet-base-v2']
TOP_K = 5


@st.cache_data(show_spinner=False)
def load_dataset(path=DATASET_PATH):
    """Read the articles CSV and return it as an all-string DataFrame.

    Missing values become empty strings so every field can be concatenated
    into the result cards without NaN checks. Cached so the CSV is parsed
    once instead of on every Streamlit rerun.
    """
    frame = pd.read_csv(path, sep=",", encoding="utf-8")
    return frame.replace(np.nan, '').astype(str)


@st.cache_resource(show_spinner=False)
def _load_embedder(model_name):
    """Instantiate the sentence-transformers model once per model name."""
    return SentenceTransformer(model_name)


@st.cache_resource(show_spinner=False)
def _load_corpus_embeddings(model_name):
    """Load the precomputed corpus embeddings for ``model_name`` onto the CPU."""
    # NOTE(review): torch.load unpickles the file; only point this at
    # embeddings files produced by a trusted pipeline.
    return torch.load(
        f"embeddings/embeddings_{model_name}.pt",
        map_location=torch.device('cpu'),
    )


def _article_summary(article_data):
    """Pick the displayed fields out of one dataset row."""
    return {
        "title": article_data["title_s"] + ". " + article_data["subTitle_s"],
        "date": article_data["producedDate_s"],
        "journal": article_data["journalTitle_s"],
        "pub": article_data["journalPublisher_s"],
        "abstract": article_data["abstract_s"],
    }


def llm_response(query, model_option):
    """Return the ``TOP_K`` articles semantically closest to ``query``.

    Args:
        query: Free-text question entered by the user.
        model_option: Name of the sentence-transformers model; it also
            selects the embeddings file ``embeddings/embeddings_<name>.pt``.

    Returns:
        A list of dicts with the keys ``title``, ``date``, ``journal``,
        ``pub`` and ``abstract``, in the order returned by the search.
    """
    embedder = _load_embedder(model_option)
    question_embedding = embedder.encode(query, convert_to_tensor=True)
    corpus_embeddings = _load_corpus_embeddings(model_option)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=TOP_K)
    # Assumes row i of the embeddings file corresponds to row i of the CSV
    # -- TODO confirm against the script that generates the embeddings.
    return [_article_summary(df.iloc[hit['corpus_id']]) for hit in hits[0]]


def _hal_search_url(query):
    """Build the HAL search URL for ``query`` with a properly encoded query string."""
    params = {"q": query, "rows": 30, "docType_s": "ART"}
    return f"{HAL_SEARCH_URL}?{urlencode(params)}"


def _format_result(article):
    """Render one search hit as Markdown, one field per line."""
    fields = (
        "**Title** : " + article["title"],
        "**Date** : " + article["date"],
        "**Journal** : " + article["journal"] + "(" + article["pub"] + ")",
        "**Abstract** : " + article["abstract"],
    )
    # Markdown only emits a hard line break after two trailing spaces.
    return "  \n".join(fields)


with st.spinner('Loading dataset...'):
    df = load_dataset()

models = MODELS
model_option = st.sidebar.selectbox("Choose the open embeddings model to use ?", models)

with st.container():
    # Strip so a whitespace-only input does not trigger a search.
    if query := st.text_input("Enter your question :").strip():
        st.markdown(f"### :green[{model_option} results]")
        with st.expander(":blue[click here to see the HAL search engine results]"):
            components.iframe(_hal_search_url(query), height=800, scrolling=True)
        with st.spinner('Calculating...'):
            response = llm_response(query, model_option)
        for x in response:
            st.success(_format_result(x))