Geraldine committed on
Commit
70c3760
1 Parent(s): 30883d6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -14
app.py CHANGED
@@ -4,21 +4,38 @@ import pandas as pd
4
  import numpy as np
5
  import torch
6
  from sentence_transformers import SentenceTransformer, util
 
 
 
7
 
8
  # Set Streamlit page configuration
9
  st.set_page_config(page_title="App", layout="wide")
10
 
11
- st.title("Semantic Search on HAL UNIV-COTEDAZUR Collection (articles)")
 
12
 
13
- with st.spinner('Loading dataset...'):
14
- df = pd.read_csv("data/hal_articles.csv", sep=",", encoding="utf-8")
 
 
 
 
 
 
15
  df = df.replace(np.nan, '')
16
  df = df.astype(str)
17
-
18
- def llm_response(query, model_option):
19
- embedder = SentenceTransformer(model_option)
 
 
 
 
 
 
 
 
20
  question_embedding = embedder.encode(query, convert_to_tensor=True)
21
- corpus_embeddings = torch.load(f"embeddings/embeddings_{model_option}.pt", map_location=torch.device('cpu'))
22
  hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=5)
23
  article_data_list = []
24
  data_list = []
@@ -34,18 +51,13 @@ def llm_response(query, model_option):
34
  })
35
  return article_data_list
36
 
37
- models = ['all-MiniLM-L6-v2', 'all-mpnet-base-v2']
38
- model_option = st.sidebar.selectbox("Choose the open embeddings model to use ?", models)
39
-
40
- #OPENAI_API_KEY = st.sidebar.text_input(":green[Optional : Enter your OPENAi API KEY here :]")
41
-
42
  with st.container():
43
  if query := st.text_input(
44
  "Enter your question :"):
45
  st.markdown(f"### :green[{model_option} results]")
46
  with st.expander(":blue[click here to see the HAL search engine results]"):
47
- components.iframe(f"https://hal.univ-cotedazur.fr/search/index/?q={query}&rows=30&docType_s=ART", height=800, scrolling=True)
48
  with st.spinner('Calculating...'):
49
- response = llm_response(query, model_option)
50
  for x in response:
51
  st.success("**Title** : " + x["title"] + " \n " + "**Date** : " + x["date"] + " \n " + "**Journal** : " + x["journal"] + "(" + x["pub"] + ")" + " \n " + "**Abstract** : " + x["abstract"])
 
4
  import numpy as np
5
  import torch
6
  from sentence_transformers import SentenceTransformer, util
7
+ from datasets import load_dataset
8
+ from huggingface_hub import hf_hub_download
9
+ import pickle
10
 
11
  # Set Streamlit page configuration
12
  st.set_page_config(page_title="App", layout="wide")
13
 
14
+ st.title("Semantic Search on HAL UNIV-COTEDAZUR SHS articles from 2013 to 2023")
15
+ st.subheader("The pre-processed data are accesible and documented from this HF dataset ")
16
 
17
+ with st.spinner('Loading datasets...'):
18
+ dataset = load_dataset(
19
+ "Geraldine/hal_univcotedazur_shs_articles_2013-2023",
20
+ revision="main"
21
+ )
22
+ # data
23
+ hal_data = load_dataset("Geraldine/hal_univcotedazur_shs_articles_2013-2023", data_files="hal_data.csv")
24
+ df = pd.DataFrame(hal_data["train"])
25
  df = df.replace(np.nan, '')
26
  df = df.astype(str)
27
+ # embeddings
28
+ hf_hub_download(repo_id="Geraldine/hal_univcotedazur_shs_articles_2013-2023",
29
+ filename="hal_embeddings.pkl",
30
+ repo_type="dataset",
31
+ cache_dir="data", local_dir="data")
32
+ file = open("data/hal_embeddings.pkl",'rb')
33
+ corpus_embeddings = pickle.load(file)
34
+
35
+ model_id = "sentence-transformers/all-MiniLM-L6-v2"
36
+ def llm_response(query):
37
+ embedder = SentenceTransformer(model_id)
38
  question_embedding = embedder.encode(query, convert_to_tensor=True)
 
39
  hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=5)
40
  article_data_list = []
41
  data_list = []
 
51
  })
52
  return article_data_list
53
 
 
 
 
 
 
54
  with st.container():
55
  if query := st.text_input(
56
  "Enter your question :"):
57
  st.markdown(f"### :green[{model_option} results]")
58
  with st.expander(":blue[click here to see the HAL search engine results]"):
59
+ components.iframe(f"https://hal.univ-cotedazur.fr/search/index/?q={query}&rows=30&publicationDateY_i=2023+OR+2022+OR+2021+OR+2020+OR+2019+OR+2018+OR+2017+OR+2016+OR+2015+OR+2014+OR+2013&docType_s=ART", height=800, scrolling=True)
60
  with st.spinner('Calculating...'):
61
+ response = llm_response(query)
62
  for x in response:
63
  st.success("**Title** : " + x["title"] + " \n " + "**Date** : " + x["date"] + " \n " + "**Journal** : " + x["journal"] + "(" + x["pub"] + ")" + " \n " + "**Abstract** : " + x["abstract"])