import pickle
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import streamlit as st
import streamlit.components.v1 as components
import torch
from datasets import load_dataset
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer, util

# --- Page setup and corpus loading -------------------------------------------
st.set_page_config(page_title="App", layout="wide")

st.title("Semantic Search on HAL UNIV-COTEDAZUR SHS articles from 2013 to 2023")
st.subheader("The pre-processed data are accesible and documented from this HF dataset ")


@st.cache_resource(show_spinner=False)
def _load_corpus():
    """Fetch the article metadata and the pre-computed corpus embeddings.

    Streamlit re-executes the whole script on every widget interaction, so
    the loading work is wrapped in ``st.cache_resource`` to run once per
    server process instead of once per rerun.

    Returns:
        A 4-tuple ``(dataset, hal_data, df, corpus_embeddings)`` where
        ``dataset`` is the repository loaded at revision ``main``,
        ``hal_data`` is the result of loading ``hal_data.csv`` only, ``df``
        is its ``train`` split as an all-string DataFrame (NaN replaced by
        the empty string), and ``corpus_embeddings`` is whatever object is
        stored in ``hal_embeddings.pkl``.
    """
    repo_id = "Geraldine/hal_univcotedazur_shs_articles_2013-2023"

    # NOTE(review): `dataset` is not referenced anywhere else in the visible
    # code; it is still loaded so the module-level name keeps existing.
    dataset = load_dataset(repo_id, revision="main")

    hal_data = load_dataset(repo_id, data_files="hal_data.csv")
    df = pd.DataFrame(hal_data["train"])
    # Normalise every cell to str so the string concatenations done when
    # rendering results cannot fail on NaN / non-string values.
    df = df.replace(np.nan, "").astype(str)

    # Use the path returned by the download call rather than a hard-coded
    # copy of it, so the two cannot drift apart.
    embeddings_path = hf_hub_download(
        repo_id=repo_id,
        filename="hal_embeddings.pkl",
        repo_type="dataset",
        cache_dir="data",
        local_dir="data",
    )
    # NOTE(security): pickle.load can execute arbitrary code contained in the
    # file; this is only acceptable while the dataset repository is trusted.
    with open(embeddings_path, "rb") as embeddings_file:
        corpus_embeddings = pickle.load(embeddings_file)

    return dataset, hal_data, df, corpus_embeddings


with st.spinner('Loading datasets...'):
    dataset, hal_data, df, corpus_embeddings = _load_corpus()


# Sentence-transformers checkpoint used to embed the user's query.
model_id = "sentence-transformers/all-MiniLM-L6-v2"


@st.cache_resource(show_spinner=False)
def _get_embedder(model_name):
    """Instantiate the SentenceTransformer for ``model_name`` once and reuse it.

    Building the model on every query is slow; ``st.cache_resource`` keeps a
    single instance alive across script reruns.
    """
    return SentenceTransformer(model_name)


def llm_response(query):
    """Return the five corpus articles most similar to ``query``.

    The query is embedded with the ``model_id`` model and compared against
    the module-level ``corpus_embeddings`` via ``util.semantic_search``. Each
    hit's ``corpus_id`` is used as a positional row index into the
    module-level ``df``.

    Args:
        query: The text to search for.

    Returns:
        A list with one dict per hit, in the order returned by the search,
        with the keys ``title`` (title, ". ", subtitle), ``date``,
        ``journal``, ``pub`` and ``abstract``.
    """
    embedder = _get_embedder(model_id)
    question_embedding = embedder.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=5)

    article_data_list = []
    # semantic_search returns one hit list per query; only one query is sent.
    for hit in hits[0]:
        article_data = df.iloc[hit["corpus_id"]]
        article_data_list.append(
            {
                "title": article_data["title_s"] + ". " + article_data["subTitle_s"],
                "date": article_data["producedDate_s"],
                "journal": article_data["journalTitle_s"],
                "pub": article_data["journalPublisher_s"],
                "abstract": article_data["abstract_s"],
            }
        )
    return article_data_list


# --- Search UI ---------------------------------------------------------------
with st.container():
    if query := st.text_input("Enter your question :"):
        # The header names the embedding model in use. The previous code
        # referenced `model_option`, which is never defined in this file and
        # raised NameError as soon as a query was submitted.
        st.markdown(f"### :green[{model_id} results]")

        # Side-by-side comparison with HAL's own search engine for the same
        # query, restricted to articles (docType_s=ART) from 2013 to 2023.
        with st.expander(":blue[click here to see the HAL search engine results]"):
            # The query is free user text: encode it so characters such as
            # '&', '#', '+' or spaces cannot corrupt the URL's query string.
            hal_url = (
                "https://hal.univ-cotedazur.fr/search/index/"
                f"?q={quote_plus(query)}"
                "&rows=30"
                "&publicationDateY_i=2023+OR+2022+OR+2021+OR+2020+OR+2019+OR+2018"
                "+OR+2017+OR+2016+OR+2015+OR+2014+OR+2013"
                "&docType_s=ART"
            )
            components.iframe(hal_url, height=800, scrolling=True)

        with st.spinner('Calculating...'):
            response = llm_response(query)

        # One green box per hit, in the order returned by llm_response.
        for x in response:
            st.success(
                f"**Title** : {x['title']} \n "
                f"**Date** : {x['date']} \n "
                f"**Journal** : {x['journal']}({x['pub']}) \n "
                f"**Abstract** : {x['abstract']}"
            )
|