Spaces:

Geraldine
/

HAL-UNIV-COTEDAZUR_semantic_search

Sleeping

App Files Community

Geraldine committed on May 22

Commit

70c3760

•

1 Parent(s): 30883d6

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -14

app.py CHANGED Viewed

@@ -4,21 +4,38 @@ import pandas as pd
 import numpy as np
 import torch
 from sentence_transformers import SentenceTransformer, util
 # Set Streamlit page configuration
 st.set_page_config(page_title="App", layout="wide")
-st.title("Semantic Search on HAL UNIV-COTEDAZUR Collection (articles)")
-with st.spinner('Loading dataset...'):
-    df = pd.read_csv("data/hal_articles.csv", sep=",", encoding="utf-8")
     df = df.replace(np.nan, '')
     df = df.astype(str)
-def llm_response(query, model_option):
-    embedder = SentenceTransformer(model_option)
     question_embedding = embedder.encode(query, convert_to_tensor=True)
-    corpus_embeddings = torch.load(f"embeddings/embeddings_{model_option}.pt", map_location=torch.device('cpu'))
     hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=5)
     article_data_list = []
     data_list = []
@@ -34,18 +51,13 @@ def llm_response(query, model_option):
                                   })
     return article_data_list
-models = ['all-MiniLM-L6-v2', 'all-mpnet-base-v2']
-model_option = st.sidebar.selectbox("Choose the open embeddings model to use ?", models)
-#OPENAI_API_KEY = st.sidebar.text_input(":green[Optional : Enter your OPENAi API KEY here :]")
 with st.container():
     if query := st.text_input(
         "Enter your question :"):
         st.markdown(f"### :green[{model_option} results]")
         with st.expander(":blue[click here to see the HAL search engine results]"):
-            components.iframe(f"https://hal.univ-cotedazur.fr/search/index/?q={query}&rows=30&docType_s=ART", height=800, scrolling=True)
         with st.spinner('Calculating...'):
-            response = llm_response(query, model_option)
             for x in response:
                 st.success("**Title** : " + x["title"] + "  \n  " + "**Date** : " + x["date"] + "  \n  " + "**Journal** : " + x["journal"] + "(" + x["pub"] + ")" + "  \n  " + "**Abstract** : " + x["abstract"])

 import numpy as np
 import torch
 from sentence_transformers import SentenceTransformer, util
+from datasets import load_dataset
+from huggingface_hub import hf_hub_download
+import pickle
 # Set Streamlit page configuration
 st.set_page_config(page_title="App", layout="wide")
+st.title("Semantic Search on HAL UNIV-COTEDAZUR SHS articles from 2013 to 2023")
+st.subheader("The pre-processed data are accesible and documented from this HF dataset ")
+with st.spinner('Loading datasets...'):
+    dataset = load_dataset(
+      "Geraldine/hal_univcotedazur_shs_articles_2013-2023",
+       revision="main"
+       )
+    # data
+    hal_data = load_dataset("Geraldine/hal_univcotedazur_shs_articles_2013-2023", data_files="hal_data.csv")
+    df = pd.DataFrame(hal_data["train"])
     df = df.replace(np.nan, '')
     df = df.astype(str)
+    # embeddings
+    hf_hub_download(repo_id="Geraldine/hal_univcotedazur_shs_articles_2013-2023",
+                filename="hal_embeddings.pkl",
+                repo_type="dataset",
+                cache_dir="data", local_dir="data")
+    file = open("data/hal_embeddings.pkl",'rb')
+    corpus_embeddings = pickle.load(file)
+model_id = "sentence-transformers/all-MiniLM-L6-v2"
+def llm_response(query):
+    embedder = SentenceTransformer(model_id)
     question_embedding = embedder.encode(query, convert_to_tensor=True)
     hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=5)
     article_data_list = []
     data_list = []
                                   })
     return article_data_list
 with st.container():
     if query := st.text_input(
         "Enter your question :"):
         st.markdown(f"### :green[{model_option} results]")
         with st.expander(":blue[click here to see the HAL search engine results]"):
+            components.iframe(f"https://hal.univ-cotedazur.fr/search/index/?q={query}&rows=30&publicationDateY_i=2023+OR+2022+OR+2021+OR+2020+OR+2019+OR+2018+OR+2017+OR+2016+OR+2015+OR+2014+OR+2013&docType_s=ART", height=800, scrolling=True)
         with st.spinner('Calculating...'):
+            response = llm_response(query)
             for x in response:
                 st.success("**Title** : " + x["title"] + "  \n  " + "**Date** : " + x["date"] + "  \n  " + "**Journal** : " + x["journal"] + "(" + x["pub"] + ")" + "  \n  " + "**Abstract** : " + x["abstract"])