Geraldine committed on
Commit
1688a82
1 Parent(s): 42cb173

Upload 6 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/hal_articles.csv filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,29 +1,54 @@
1
  import streamlit as st
2
- import openai
3
- import tiktoken
4
- from openai.embeddings_utils import get_embedding, cosine_similarity
5
  import torch
6
  from sentence_transformers import SentenceTransformer, util
7
 
8
  # Set Streamlit page configuration
9
  st.set_page_config(page_title="App", layout="wide")
10
 
11
- st.title("HAL UNIV-COTEDAZUR Collection Semantic Search")
12
 
13
- model_options = st.sidebar.selectbox("Choose the free embeddings model to use ?", ('all-MiniLM-L6-v2', 'multi-qa-mpnet-base-dot-v1'))
 
 
 
14
 
15
- if ok_openai := st.sidebar.checkbox('You want to use the OpenAI Embeddings too'):
16
- if openai_api_key := st.text_input(
17
- ":blue[Put Your OPENAI API-KEY :]",
18
- placeholder="Paste your OpenAI API key here ",
19
- type="password",
20
- ):
21
- st.success("great")
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- col1, col2 = st.columns(2)
24
- if query := st.text_input(
25
- ":orange[Enter your question :]"):
26
- with col1:
27
- st.write("OpenAI place")
28
- with col2:
29
- st.write("Open LLM place")
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ import streamlit.components.v1 as components
3
+ import pandas as pd
4
+ import numpy as np
5
  import torch
6
  from sentence_transformers import SentenceTransformer, util
7
 
8
  # Set Streamlit page configuration
9
  st.set_page_config(page_title="App", layout="wide")
10
 
11
+ st.title("Semantic Search on HAL UNIV-COTEDAZUR Collection (articles)")
12
 
13
+ with st.spinner('Loading dataset...'):
14
+ df = pd.read_csv("data/hal_articles.csv", sep=",", encoding="utf-8")
15
+ df = df.replace(np.nan, '')
16
+ df = df.astype(str)
17
 
18
+ def huggingface_response(query, model_option):
19
+ embedder = SentenceTransformer(model_option)
20
+ question_embedding = embedder.encode(query, convert_to_tensor=True)
21
+ corpus_embeddings = torch.load(f"embeddings/embeddings_{model_option}.pt", map_location=torch.device('cpu'))
22
+ hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=5)
23
+ article_data_list = []
24
+ data_list = []
25
+ for hit in hits[0]:
26
+ hit_id = hit['corpus_id']
27
+ article_data = df.iloc[hit_id]
28
+ #article_data_list.append(article_data["combined"])
29
+ article_data_list.append({"title": article_data["title_s"] + ". " + article_data["subTitle_s"],
30
+ "date": article_data["producedDate_s"],
31
+ "journal" : article_data["journalTitle_s"],
32
+ "pub": article_data["journalPublisher_s"],
33
+ "abstract": article_data["abstract_s"]
34
+ })
35
+ return article_data_list
36
 
37
+ models = ['all-MiniLM-L6-v2', 'all-mpnet-base-v2']
38
+ model_option = st.sidebar.selectbox("Choose the open embeddings model to use ?", models)
39
+
40
+ #OPENAI_API_KEY = st.sidebar.text_input(":green[Optional : Enter your OPENAi API KEY here :]")
41
+
42
+ with st.container():
43
+ if query := st.text_input(
44
+ ":green[Enter your question :]"):
45
+ st.markdown(f"### :blue[{model_option} results]")
46
+ with st.spinner('Calculating...'):
47
+ response = huggingface_response(query, model_option)
48
+ for x in response:
49
+ st.success("**Title** : " + x["title"] + " \n " + "**Date** : " + x["date"] + " \n " + "**Journal** : " + x["journal"] + "(" + x["pub"] + ")" + " \n " + "**Abstract** : " + x["abstract"])
50
+ #st.write(huggingface_response(query, model_option))
51
+ #for x in response:
52
+ # st.write(x["titre"] + "\n" + x["date"] + "\n" + "------" + "\n")
53
+ st.markdown("### HAL search engine results")
54
+ components.iframe(f"https://hal.univ-cotedazur.fr/search/index/?q={query}&rows=30&docType_s=ART", height=800, scrolling=True)
data/hal_articles.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0e090cb11c263b57f1c980e962c255b4ebeecd8405cd6bb7898b05bc0f20445
3
+ size 55308538
embeddings/embeddings_all-MiniLM-L6-v2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2eed5d5df2835ca6139be11a84c8fc728befaa4804e0311b1f3db8816f7a84c2
3
+ size 34725223
embeddings/embeddings_all-mpnet-base-v2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02478d4a23588730d3a4cb9a51af539bf2b5223e648d7bd9ba3b7c7a0ea0a51f
3
+ size 69449578
pages/documentation.py CHANGED
@@ -10,35 +10,60 @@ st.header("Dataset creation")
10
 
11
  st.subheader(":blue[HAL API harvest]")
12
 
 
13
  st.write("All the API documentation is available [here](https://api.archives-ouvertes.fr/docs/search)")
14
- st.write("All records of article type publications reported in the UNIV-COTEDAZUR collection of HAL are obtained with this recursive function that populates a pandas Dataframe as output ")
15
  st.code("""
16
- global_list = []
17
- def recursive_hal_harvest(cursor="*"):
18
- url = f"https://api.archives-ouvertes.fr/search/UNIV-COTEDAZUR/?q=docType_s:ART&rows=1000&cursorMark={cursor}&fl=uri_s,title_s,subTitle_s,authFullName_s,producedDate_s,domain_t,journalTitle_s,journalPublisher_s,anrProjectCallTitle_s,abstract_s&sort=docid asc"
19
- print(url)
20
- response = requests.request("GET", url).text
21
- data = json.loads(response)
22
- for doc in data["response"]["docs"]:
23
- global_list.append(doc)
24
- if len(data["response"]["docs"]) != 0:
25
- return recursive_hal_harvest(cursor=data["nextCursorMark"])
26
- else:
27
- return global_list
28
- df = pd.DataFrame(recursive_hal_harvest())
29
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  """, language='python')
31
 
32
  st.write("The dataframe's columns of metadata are then concatenated into a single combined text in a new column. It is therefore on this new column that the different embeddings models will be applied to encode this combined text and output a single vector embedding.")
33
  st.code("""
34
  df = df.astype(str)
35
  df["combined"] = (
36
- "Title: " + df.title_s + ";Subtitle:" + df.subTitle_s + ";Author:" + df.authFullName_s + ";Date:" + df.producedDate_s + ";Journal Title:" + df.journalTitle_s + ";Publisher:" + df.journalPublisher_s + ";ANR Project:" + df.anrProjectCallTitle_s + "; Abstract: " + df.abstract_s
37
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  """, language='python')
40
 
41
- st.subheader(":blue[OpenAI Embeddings]")
 
 
42
 
43
  st.code("""
44
  import openai
@@ -57,12 +82,28 @@ encoding = tiktoken.get_encoding(embedding_encoding)
57
  df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
58
  df = df[df.n_tokens <= max_tokens]
59
 
60
- # générate embeddings
61
- df["openai_embedding"] = df.combined.apply(lambda x: get_embedding(x, engine=embedding_model) )
62
- df["openai_embedding"] = df.embedding.astype(str).apply(eval).apply(np.array)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
  """, language='python')
65
 
66
- st.subheader(":blue[Huggingface free models for Embeddings]")
67
 
68
- st.write("The open source Huggingface platform hosts a large number of pre-trained models that can then be reused for many tasks (text or image classification, summarization, document QA etc...). We can then use the popular sentence-transformers library applied on free available text embedding models for creating embeddings ")
 
10
 
11
  st.subheader(":blue[HAL API harvest]")
12
 
13
+ st.write("HAL is the French national open archive for scientific publications based on the principles of open access and self-archiving.")
14
  st.write("All the API documentation is available [here](https://api.archives-ouvertes.fr/docs/search)")
15
+ st.write("All records of article type publications reported in the UNIV-COTEDAZUR collection of HAL are obtained with this looping function that populates a pandas Dataframe as output ")
16
  st.code("""
17
+ # we retrieve first the total number of records
18
+ url_for_total_count = "https://api.archives-ouvertes.fr/search/UNIV-COTEDAZUR/?q=docType_s:ART&rows=0"
19
+ response = requests.request("GET", url_for_total_count).text
20
+ data = json.loads(response)
21
+ total_count = data["response"]["numFound"]
 
 
 
 
 
 
 
 
22
 
23
+ """, language='python')
24
+ st.code("""
25
+ step = 1000
26
+ df = []
27
+ for i in range(0, int(total_count), int(step)):
28
+ url = f"https://api.archives-ouvertes.fr/search/UNIV-COTEDAZUR/?q=docType_s:ART&rows={step}&start={i}&wt=csv&fl=uri_s,title_s,subTitle_s,authFullName_s,producedDate_s,domain_t,journalTitle_s,journalPublisher_s,abstract_s"
29
+ data = pd.read_csv(url, encoding="utf-8")
30
+ df.append(data)
31
+ df = pd.concat(df)
32
+ # clean up a little bit
33
+ df = df.drop_duplicates(subset=['uri_s'])
34
+ df = df.replace(np.nan, '')
35
  """, language='python')
36
 
37
  st.write("The dataframe's columns of metadata are then concatenated into a single combined text in a new column. It is therefore on this new column that the different embeddings models will be applied to encode this combined text and output a single vector embedding.")
38
  st.code("""
39
  df = df.astype(str)
40
  df["combined"] = (
41
+ "Title: " + df.title_s + ". " + df.subTitle_s + ". Authors :" + df.authFullName_s + ". Publication date :" + df.producedDate_s + ". Journal title :" + df.journalTitle_s + ". Publisher :" + df.journalPublisher_s + ". Abstract : " + df.abstract_s
42
+ )
43
+ """, language='python')
44
+
45
+ st.subheader(":blue[Huggingface open models for Embeddings]")
46
+
47
+ st.write("The open source Huggingface platform hosts a large number of pre-trained models that can then be reused for many tasks (text or image classification, summarization, document QA etc...). We can then use the sentence-transformers library applied on some of these available embedding pre-trained models for creating embeddings.")
48
+ st.write("There are two ways of working with the Huggingface hosted models : by using the [inference API endpoint](https://huggingface.co/inference-api) or by locally importing the model. Here we choose the second way")
49
+ st.write("Two open source transformers-based models have been used to convert the textual metadata into numerical vector representation, which generated two vector embeddings datasets : embeddings_all-MiniLM-L6-v2.pt and embeddings_all-mpnet-base-v2.pt")
50
+ st.code("""
51
+ import torch
52
+ from sentence_transformers import SentenceTransformer
53
+
54
+ embedder = SentenceTransformer('all-MiniLM-L6-v2') # or 'all-mpnet-base-v2'
55
+
56
+ corpus_embeddings = embedder.encode(df.combined, convert_to_tensor=True)
57
+
58
+ # how to save and reload
59
+ torch.save(corpus_embeddings, f"{LOCAL_PATH}/embeddings_all-MiniLM-L6-v2.pt")
60
+ corpus_embeddings = torch.load(f"{LOCAL_PATH}/embeddings_all-MiniLM-L6-v2.pt")
61
 
62
  """, language='python')
63
 
64
+ st.subheader(":blue[Bonus : OpenAI Embeddings]")
65
+
66
+ st.write("If you want to do the same with text-embedding-ada-002 (the OpenAI embeddings model)")
67
 
68
  st.code("""
69
  import openai
 
82
  df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
83
  df = df[df.n_tokens <= max_tokens]
84
 
85
+ # generate embeddings
86
+ def custom_get_embedding(text: str) -> list[float]:
87
+ return openai.Embedding.create(input=[text], model="text-embedding-ada-002")["data"][0]["embedding"]
88
+ df["openai_embedding"] = df.combined.apply(lambda x: custom_get_embedding(x) )
89
+
90
+ """, language='python')
91
+
92
+ st.write("And the Streamlit UI code would be :")
93
+
94
+ st.code("""
95
+ df["openai_embedding"] = df.openai_embedding.apply(literal_eval).apply(np.array)
96
+ def custom_get_embedding(text: str) -> list[float]:
97
+ return openai.Embedding.create(input=[text], model="text-embedding-ada-002", api_key=OPENAI_API_KEY)["data"][0]["embedding"]
98
+ def openai_response(query):
99
+ query_embedding = np.array(custom_get_embedding(
100
+ query
101
+ ))
102
+ df["similarity"] = df.openai_embedding.apply(lambda x: cosine_similarity(x, query_embedding))
103
+ return df.sort_values("similarity", ascending=False).head(5).to_json(orient="records")
104
 
105
  """, language='python')
106
 
107
+ st.header("Dataset hosting")
108
 
109
+ st.write("The csv file of the dataset is available in the data folder")
requirements.txt CHANGED
@@ -12,6 +12,7 @@ colorama==0.4.6
12
  contourpy==1.1.0
13
  cycler==0.11.0
14
  decorator==5.1.1
 
15
  filelock==3.12.2
16
  fonttools==4.41.0
17
  frozenlist==1.4.0
@@ -32,10 +33,10 @@ matplotlib==3.7.2
32
  mdurl==0.1.2
33
  mpmath==1.3.0
34
  multidict==6.0.4
 
35
  networkx==3.1
36
  nltk==3.8.1
37
  numpy==1.25.1
38
- openai==0.27.8
39
  packaging==23.1
40
  pandas==2.0.3
41
  Pillow==9.5.0
@@ -62,7 +63,7 @@ sentence-transformers==2.2.2
62
  sentencepiece==0.1.99
63
  six==1.16.0
64
  smmap==5.0.0
65
- streamlit==1.24.1
66
  sympy==1.12
67
  tenacity==8.2.2
68
  threadpoolctl==3.2.0
@@ -81,5 +82,6 @@ tzlocal==4.3.1
81
  urllib3==2.0.3
82
  validators==0.20.0
83
  watchdog==3.0.0
 
84
  yarl==1.9.2
85
  zipp==3.16.2
 
12
  contourpy==1.1.0
13
  cycler==0.11.0
14
  decorator==5.1.1
15
+ dill==0.3.6
16
  filelock==3.12.2
17
  fonttools==4.41.0
18
  frozenlist==1.4.0
 
33
  mdurl==0.1.2
34
  mpmath==1.3.0
35
  multidict==6.0.4
36
+ multiprocess==0.70.14
37
  networkx==3.1
38
  nltk==3.8.1
39
  numpy==1.25.1
 
40
  packaging==23.1
41
  pandas==2.0.3
42
  Pillow==9.5.0
 
63
  sentencepiece==0.1.99
64
  six==1.16.0
65
  smmap==5.0.0
66
+ streamlit==1.25.0
67
  sympy==1.12
68
  tenacity==8.2.2
69
  threadpoolctl==3.2.0
 
82
  urllib3==2.0.3
83
  validators==0.20.0
84
  watchdog==3.0.0
85
+ xxhash==3.2.0
86
  yarl==1.9.2
87
  zipp==3.16.2