Geraldine's picture
Update app.py
70c3760 verified
raw
history blame
No virus
2.99 kB
import pickle
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import streamlit as st
import streamlit.components.v1 as components
import torch
from datasets import load_dataset
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer, util
# Page chrome. set_page_config is the first Streamlit call issued by this
# script; the heading texts are kept in named constants for readability.
_PAGE_HEADING = "Semantic Search on HAL UNIV-COTEDAZUR SHS articles from 2013 to 2023"
_PAGE_SUBHEADING = "The pre-processed data are accesible and documented from this HF dataset "
st.set_page_config(layout="wide", page_title="App")
st.title(_PAGE_HEADING)
st.subheader(_PAGE_SUBHEADING)
# Load the article metadata and the pre-computed corpus embeddings from the
# Hugging Face Hub, showing a spinner while the downloads run.
with st.spinner('Loading datasets...'):
    # NOTE(review): `dataset` is not referenced anywhere else in the visible
    # code; this full-repository load looks redundant with the CSV load below.
    # Kept so the module-level name still exists -- confirm before removing.
    dataset = load_dataset(
        "Geraldine/hal_univcotedazur_shs_articles_2013-2023",
        revision="main"
    )
    # data: one row per article; NaN cells become empty strings and every
    # column is cast to str so the fields can be concatenated for display.
    hal_data = load_dataset("Geraldine/hal_univcotedazur_shs_articles_2013-2023", data_files="hal_data.csv")
    df = pd.DataFrame(hal_data["train"])
    df = df.replace(np.nan, '')
    df = df.astype(str)
    # embeddings: use the path returned by hf_hub_download instead of
    # re-spelling the download location by hand.
    embeddings_path = hf_hub_download(repo_id="Geraldine/hal_univcotedazur_shs_articles_2013-2023",
                                      filename="hal_embeddings.pkl",
                                      repo_type="dataset",
                                      cache_dir="data", local_dir="data")
    # SECURITY: pickle.load executes arbitrary code contained in the file.
    # This is only acceptable while the dataset repository above is trusted.
    # The context manager closes the handle, which the bare open() never did.
    with open(embeddings_path, 'rb') as file:
        corpus_embeddings = pickle.load(file)
# Sentence-transformers model used to embed the user's query.
# presumably the same model that produced hal_embeddings.pkl -- TODO confirm
model_id = "sentence-transformers/all-MiniLM-L6-v2"
def llm_response(query):
    """Return the corpus articles most similar to *query*.

    The query is embedded with the model named by the module-level
    ``model_id`` and compared against the module-level ``corpus_embeddings``
    with ``util.semantic_search`` (``top_k=5``). Each hit's ``corpus_id`` is
    used as a positional row index into the module-level ``df``.

    Args:
        query: Text passed to the embedder's ``encode`` method.

    Returns:
        A list of dicts, one per hit, in the order produced by the search.
        Keys: ``"title"`` (title and sub-title joined by ``". "``),
        ``"date"``, ``"journal"``, ``"pub"`` and ``"abstract"``.
    """
    # NOTE(review): the model is instantiated on every call, which repeats the
    # model load for each query; consider a cached instance (for example
    # Streamlit's resource cache) if the deployed Streamlit version allows it.
    embedder = SentenceTransformer(model_id)
    question_embedding = embedder.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=5)
    article_data_list = []
    # Only one query is submitted, so only the first result list is read.
    for hit in hits[0]:
        article_data = df.iloc[hit['corpus_id']]
        article_data_list.append({
            "title": article_data["title_s"] + ". " + article_data["subTitle_s"],
            "date": article_data["producedDate_s"],
            "journal": article_data["journalTitle_s"],
            "pub": article_data["journalPublisher_s"],
            "abstract": article_data["abstract_s"],
        })
    return article_data_list
# Search UI: read a question, show the HAL portal's own results for
# comparison, then list the semantic-search hits.
with st.container():
    if query := st.text_input(
            "Enter your question :"):
        # The heading previously interpolated `model_option`, a name that is
        # never defined in this file (NameError as soon as a query was
        # entered); the model actually in use is `model_id`.
        st.markdown(f"### :green[{model_id} results]")
        with st.expander(":blue[click here to see the HAL search engine results]"):
            # URL-encode the free-text query so spaces, '&', '#', accents etc.
            # cannot break or alter the HAL search URL.
            hal_url = (
                f"https://hal.univ-cotedazur.fr/search/index/?q={quote_plus(query)}"
                "&rows=30&publicationDateY_i=2023+OR+2022+OR+2021+OR+2020+OR+2019+OR+2018+OR+2017+OR+2016+OR+2015+OR+2014+OR+2013&docType_s=ART"
            )
            components.iframe(hal_url, height=800, scrolling=True)
        with st.spinner('Calculating...'):
            response = llm_response(query)
            for x in response:
                # One green box per article; " \n " separates the fields
                # exactly as in the original concatenation.
                st.success(" \n ".join([
                    "**Title** : " + x["title"],
                    "**Date** : " + x["date"],
                    "**Journal** : " + x["journal"] + "(" + x["pub"] + ")",
                    "**Abstract** : " + x["abstract"],
                ]))