Upload 6 files

- .gitattributes +1 -0
- app.py +44 -19
- data/hal_articles.csv +3 -0
- embeddings/embeddings_all-MiniLM-L6-v2.pt +3 -0
- embeddings/embeddings_all-mpnet-base-v2.pt +3 -0
- pages/documentation.py +63 -22
- requirements.txt +4 -2
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/hal_articles.csv filter=lfs diff=lfs merge=lfs -text
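With this rule, Git stores only a small text pointer for the 55 MB CSV in the repository and keeps the actual bytes in LFS storage; the pointer format is visible verbatim in the data/hal_articles.csv entry below. A minimal sketch of checking for such a pointer before the LFS objects have been pulled (the helper name is hypothetical; the spec line is the one shown in this commit):

# hypothetical helper: detect whether a checked-out file is still a Git LFS pointer
from pathlib import Path

def is_lfs_pointer(path: str) -> bool:
    head = Path(path).read_bytes()[:120]
    return head.startswith(b"version https://git-lfs.github.com/spec/v1")

print(is_lfs_pointer("data/hal_articles.csv"))  # True until `git lfs pull` fetches the data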
app.py
CHANGED
@@ -1,29 +1,54 @@
 import streamlit as st
-import …
-import …
-…
+import streamlit.components.v1 as components
+import pandas as pd
+import numpy as np
 import torch
 from sentence_transformers import SentenceTransformer, util
 
 # Set Streamlit page configuration
 st.set_page_config(page_title="App", layout="wide")
 
-st.title("HAL UNIV-COTEDAZUR Collection …
+st.title("Semantic Search on HAL UNIV-COTEDAZUR Collection (articles)")
 
-…
+with st.spinner('Loading dataset...'):
+    df = pd.read_csv("data/hal_articles.csv", sep=",", encoding="utf-8")
+    df = df.replace(np.nan, '')
+    df = df.astype(str)
 
-…
+def huggingface_response(query, model_option):
+    embedder = SentenceTransformer(model_option)
+    question_embedding = embedder.encode(query, convert_to_tensor=True)
+    corpus_embeddings = torch.load(f"embeddings/embeddings_{model_option}.pt", map_location=torch.device('cpu'))
+    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=5)
+    article_data_list = []
+    for hit in hits[0]:
+        hit_id = hit['corpus_id']
+        article_data = df.iloc[hit_id]
+        article_data_list.append({"title": article_data["title_s"] + ". " + article_data["subTitle_s"],
+                                  "date": article_data["producedDate_s"],
+                                  "journal": article_data["journalTitle_s"],
+                                  "pub": article_data["journalPublisher_s"],
+                                  "abstract": article_data["abstract_s"]})
+    return article_data_list
 
-…
+models = ['all-MiniLM-L6-v2', 'all-mpnet-base-v2']
+model_option = st.sidebar.selectbox("Choose the open embeddings model to use:", models)
+
+#OPENAI_API_KEY = st.sidebar.text_input(":green[Optional : Enter your OpenAI API key here:]")
+
+with st.container():
+    if query := st.text_input(":green[Enter your question:]"):
+        st.markdown(f"### :blue[{model_option} results]")
+        with st.spinner('Calculating...'):
+            response = huggingface_response(query, model_option)
+            for x in response:
+                st.success("**Title** : " + x["title"] + " \n " + "**Date** : " + x["date"] + " \n " + "**Journal** : " + x["journal"] + " (" + x["pub"] + ")" + " \n " + "**Abstract** : " + x["abstract"])
+        st.markdown("### HAL search engine results")
+        components.iframe(f"https://hal.univ-cotedazur.fr/search/index/?q={query}&rows=30&docType_s=ART", height=800, scrolling=True)
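For reference, huggingface_response() relies on the return shape of util.semantic_search: one list of hits per query, each hit a dict with a 'corpus_id' index and a 'score'. A minimal standalone sketch with a toy corpus (the two sentences are invented; the sentence-transformers calls match those used in app.py):

from sentence_transformers import SentenceTransformer, util

embedder = SentenceTransformer('all-MiniLM-L6-v2')
corpus = ["Deep learning for galaxy classification",
          "Coastal erosion on the Mediterranean shoreline"]
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
question_embedding = embedder.encode("astronomy with neural networks", convert_to_tensor=True)

hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=2)
for hit in hits[0]:  # hits[0] holds the hits for the first (and only) query
    # app.py maps each hit back to a dataset row via df.iloc[hit['corpus_id']]
    print(hit['corpus_id'], round(hit['score'], 3), corpus[hit['corpus_id']])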
data/hal_articles.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0e090cb11c263b57f1c980e962c255b4ebeecd8405cd6bb7898b05bc0f20445
+size 55308538
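Once `git lfs pull` has materialised the pointer above, this file is the harvested dataset that app.py loads at startup. A quick sketch of inspecting it; the expected column list is taken from the fl= parameter of the harvest URL in pages/documentation.py, so treat it as an assumption about the schema rather than a guarantee:

import pandas as pd

df = pd.read_csv("data/hal_articles.csv", sep=",", encoding="utf-8")
print(len(df), "records")
print(df.columns.tolist())
# expected, per the harvest's fl= field list:
# ['uri_s', 'title_s', 'subTitle_s', 'authFullName_s', 'producedDate_s',
#  'domain_t', 'journalTitle_s', 'journalPublisher_s', 'abstract_s']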
embeddings/embeddings_all-MiniLM-L6-v2.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2eed5d5df2835ca6139be11a84c8fc728befaa4804e0311b1f3db8816f7a84c2
+size 34725223
embeddings/embeddings_all-mpnet-base-v2.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02478d4a23588730d3a4cb9a51af539bf2b5223e648d7bd9ba3b7c7a0ea0a51f
+size 69449578
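Each .pt file holds one precomputed embedding per CSV row, which is what lets app.py map hit['corpus_id'] directly to df.iloc. A minimal consistency check, assuming the LFS files have been pulled (384 and 768 are the published output dimensions of all-MiniLM-L6-v2 and all-mpnet-base-v2 respectively):

import pandas as pd
import torch

df = pd.read_csv("data/hal_articles.csv")
for name, dim in [("all-MiniLM-L6-v2", 384), ("all-mpnet-base-v2", 768)]:
    emb = torch.load(f"embeddings/embeddings_{name}.pt", map_location="cpu")
    print(name, tuple(emb.shape))   # expected: (len(df), dim)
    assert emb.shape == (len(df), dim)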
pages/documentation.py
CHANGED
@@ -10,35 +10,60 @@ st.header("Dataset creation")
 
 st.subheader(":blue[HAL API harvest]")
 
+st.write("HAL is the French national open archive for scientific publications, based on the principles of open access and self-archiving.")
 st.write("All the API documentation is available [here](https://api.archives-ouvertes.fr/docs/search)")
-st.write("All records of article type publications reported in the UNIV-COTEDAZUR collection of HAL are obtained with this …
+st.write("All records of article-type publications reported in the UNIV-COTEDAZUR collection of HAL are obtained with this looping function, which populates a pandas DataFrame as output")
 st.code("""
-…
-data = json.loads(response)
-for doc in data["response"]["docs"]:
-    global_list.append(doc)
-if len(data["response"]["docs"]) != 0:
-    return recursive_hal_harvest(cursor=data["nextCursorMark"])
-else:
-    return global_list
-df = pd.DataFrame(recursive_hal_harvest())
+# we retrieve first the total number of records
+url_for_total_count = "https://api.archives-ouvertes.fr/search/UNIV-COTEDAZUR/?q=docType_s:ART&rows=0"
+response = requests.request("GET", url_for_total_count).text
+data = json.loads(response)
+total_count = data["response"]["numFound"]
 
+""", language='python')
+st.code("""
+step = 1000
+df = []
+for i in range(1, int(total_count), int(step)):
+    url = f"https://api.archives-ouvertes.fr/search/UNIV-COTEDAZUR/?q=docType_s:ART&rows={step}&start={i}&wt=csv&fl=uri_s,title_s,subTitle_s,authFullName_s,producedDate_s,domain_t,journalTitle_s,journalPublisher_s,abstract_s"
+    data = pd.read_csv(url, encoding="utf-8")
+    df.append(data)
+df = pd.concat(df)
+# clean up a little bit
+df = df.drop_duplicates(subset=['uri_s'])
+df = df.replace(np.nan, '')
 """, language='python')
 
 st.write("The dataframe's metadata columns are then concatenated into a single combined text in a new column. The different embedding models are then applied to this new column, encoding the combined text into a single vector embedding.")
 st.code("""
 df = df.astype(str)
 df["combined"] = (
-    "Title: " + df.title_s + " …
+    "Title: " + df.title_s + ". " + df.subTitle_s + ". Authors :" + df.authFullName_s + ". Publication date :" + df.producedDate_s + ". Journal title :" + df.journalTitle_s + ". Publisher :" + df.journalPublisher_s + ". Abstract : " + df.abstract_s
+)
+""", language='python')
+
+st.subheader(":blue[Huggingface open models for Embeddings]")
+
+st.write("The open source Huggingface platform hosts a large number of pre-trained models that can be reused for many tasks (text or image classification, summarization, document QA, etc.). We can then use the sentence-transformers library with some of these pre-trained embedding models to create embeddings.")
+st.write("There are two ways of working with the Huggingface-hosted models: using the [inference API endpoint](https://huggingface.co/inference-api) or importing the model locally. Here we choose the second way.")
+st.write("Two open source transformers-based models have been used to convert the textual metadata into numerical vector representations, which generated two vector embeddings datasets: embeddings_all-MiniLM-L6-v2.pt and embeddings_all-mpnet-base-v2.pt")
+st.code("""
+import torch
+from sentence_transformers import SentenceTransformer
+
+embedder = SentenceTransformer('all-MiniLM-L6-v2')  # or 'all-mpnet-base-v2'
+
+corpus_embeddings = embedder.encode(df.combined, convert_to_tensor=True)
+
+# how to save and reload
+torch.save(corpus_embeddings, f"{LOCAL_PATH}/embeddings_all-MiniLM-L6-v2.pt")
+corpus_embeddings = torch.load(f"{LOCAL_PATH}/embeddings_all-MiniLM-L6-v2.pt")
 
 """, language='python')
 
-st.subheader(":blue[OpenAI Embeddings]")
+st.subheader(":blue[Bonus : OpenAI Embeddings]")
+
+st.write("If you want to do the same with text-embedding-ada-002 (the OpenAI embeddings model):")
 
 st.code("""
 import openai
@@ -57,12 +82,28 @@ encoding = tiktoken.get_encoding(embedding_encoding)
 df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
 df = df[df.n_tokens <= max_tokens]
 
-# …
+# generate embeddings
+def custom_get_embedding(text: str) -> list[float]:
+    return openai.Embedding.create(input=[text], model="text-embedding-ada-002")["data"][0]["embedding"]
+df["openai_embedding"] = df.combined.apply(lambda x: custom_get_embedding(x))
+
+""", language='python')
+
+st.write("And the Streamlit UI code would be:")
+
+st.code("""
+df["openai_embedding"] = df.openai_embedding.apply(literal_eval).apply(np.array)
+def custom_get_embedding(text: str) -> list[float]:
+    return openai.Embedding.create(input=[text], model="text-embedding-ada-002", api_key=OPENAI_API_KEY)["data"][0]["embedding"]
+def openai_response(query):
+    query_embedding = np.array(custom_get_embedding(query))
+    df["similarity"] = df.openai_embedding.apply(lambda x: cosine_similarity(x, query_embedding))
+    return df.sort_values("similarity", ascending=False).head(5).to_json(orient="records")
 
 """, language='python')
 
-st.…
+st.header("Dataset hosting")
 
-st.write("The …
+st.write("The CSV file of the dataset is available in the data folder")
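One caveat on the UI snippet above: it calls a cosine_similarity helper that the page never defines. openai==0.27.x shipped one in openai.embeddings_utils, or it can be written as a numpy one-liner; a minimal sketch of the assumed helper:

import numpy as np

def cosine_similarity(a, b):
    # cosine of the angle between two embedding vectors
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

print(cosine_similarity([1.0, 0.0], [1.0, 1.0]))  # ~0.707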
requirements.txt
CHANGED
@@ -12,6 +12,7 @@ colorama==0.4.6
 contourpy==1.1.0
 cycler==0.11.0
 decorator==5.1.1
+dill==0.3.6
 filelock==3.12.2
 fonttools==4.41.0
 frozenlist==1.4.0
@@ -32,10 +33,10 @@ matplotlib==3.7.2
 mdurl==0.1.2
 mpmath==1.3.0
 multidict==6.0.4
+multiprocess==0.70.14
 networkx==3.1
 nltk==3.8.1
 numpy==1.25.1
-openai==0.27.8
 packaging==23.1
 pandas==2.0.3
 Pillow==9.5.0
@@ -62,7 +63,7 @@ sentence-transformers==2.2.2
 sentencepiece==0.1.99
 six==1.16.0
 smmap==5.0.0
-streamlit==1.…
+streamlit==1.25.0
 sympy==1.12
 tenacity==8.2.2
 threadpoolctl==3.2.0
@@ -81,5 +82,6 @@ tzlocal==4.3.1
 urllib3==2.0.3
 validators==0.20.0
 watchdog==3.0.0
+xxhash==3.2.0
 yarl==1.9.2
 zipp==3.16.2