Upload 6 files

- .gitattributes +1 -0
- app.py +44 -19
- data/hal_articles.csv +3 -0
- embeddings/embeddings_all-MiniLM-L6-v2.pt +3 -0
- embeddings/embeddings_all-mpnet-base-v2.pt +3 -0
- pages/documentation.py +63 -22
- requirements.txt +4 -2
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/hal_articles.csv filter=lfs diff=lfs merge=lfs -text
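With this rule, Git stores only a small text pointer for the 55 MB CSV in the repository and keeps the actual bytes in LFS storage; the pointer format is visible verbatim in the data/hal_articles.csv entry below. A minimal sketch of checking for such a pointer before the LFS objects have been pulled (the helper name is hypothetical; the spec line is the one shown in this commit):

# hypothetical helper: detect whether a checked-out file is still a Git LFS pointer
from pathlib import Path

def is_lfs_pointer(path: str) -> bool:
    head = Path(path).read_bytes()[:120]
    return head.startswith(b"version https://git-lfs.github.com/spec/v1")

print(is_lfs_pointer("data/hal_articles.csv"))  # True until `git lfs pull` fetches the data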
app.py
CHANGED
@@ -1,29 +1,54 @@
 import streamlit as st
-import …
-import …
-…
+import streamlit.components.v1 as components
+import pandas as pd
+import numpy as np
 import torch
 from sentence_transformers import SentenceTransformer, util
 
 # Set Streamlit page configuration
 st.set_page_config(page_title="App", layout="wide")
 
-st.title("HAL UNIV-COTEDAZUR Collection …
+st.title("Semantic Search on HAL UNIV-COTEDAZUR Collection (articles)")
 
-…
+with st.spinner('Loading dataset...'):
+    df = pd.read_csv("data/hal_articles.csv", sep=",", encoding="utf-8")
+    df = df.replace(np.nan, '')
+    df = df.astype(str)
 
-…
+def huggingface_response(query, model_option):
+    embedder = SentenceTransformer(model_option)
+    question_embedding = embedder.encode(query, convert_to_tensor=True)
+    corpus_embeddings = torch.load(f"embeddings/embeddings_{model_option}.pt", map_location=torch.device('cpu'))
+    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=5)
+    article_data_list = []
+    for hit in hits[0]:
+        hit_id = hit['corpus_id']
+        article_data = df.iloc[hit_id]
+        article_data_list.append({"title": article_data["title_s"] + ". " + article_data["subTitle_s"],
+                                  "date": article_data["producedDate_s"],
+                                  "journal": article_data["journalTitle_s"],
+                                  "pub": article_data["journalPublisher_s"],
+                                  "abstract": article_data["abstract_s"]})
+    return article_data_list
 
-…
+models = ['all-MiniLM-L6-v2', 'all-mpnet-base-v2']
+model_option = st.sidebar.selectbox("Choose the open embeddings model to use:", models)
+
+#OPENAI_API_KEY = st.sidebar.text_input(":green[Optional : Enter your OpenAI API key here:]")
+
+with st.container():
+    if query := st.text_input(":green[Enter your question:]"):
+        st.markdown(f"### :blue[{model_option} results]")
+        with st.spinner('Calculating...'):
+            response = huggingface_response(query, model_option)
+            for x in response:
+                st.success("**Title** : " + x["title"] + " \n " + "**Date** : " + x["date"] + " \n " + "**Journal** : " + x["journal"] + " (" + x["pub"] + ")" + " \n " + "**Abstract** : " + x["abstract"])
+        st.markdown("### HAL search engine results")
+        components.iframe(f"https://hal.univ-cotedazur.fr/search/index/?q={query}&rows=30&docType_s=ART", height=800, scrolling=True)
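For reference, huggingface_response() relies on the return shape of util.semantic_search: one list of hits per query, each hit a dict with a 'corpus_id' index and a 'score'. A minimal standalone sketch with a toy corpus (the two sentences are invented; the sentence-transformers calls match those used in app.py):

from sentence_transformers import SentenceTransformer, util

embedder = SentenceTransformer('all-MiniLM-L6-v2')
corpus = ["Deep learning for galaxy classification",
          "Coastal erosion on the Mediterranean shoreline"]
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
question_embedding = embedder.encode("astronomy with neural networks", convert_to_tensor=True)

hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=2)
for hit in hits[0]:  # hits[0] holds the hits for the first (and only) query
    # app.py maps each hit back to a dataset row via df.iloc[hit['corpus_id']]
    print(hit['corpus_id'], round(hit['score'], 3), corpus[hit['corpus_id']])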
data/hal_articles.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0e090cb11c263b57f1c980e962c255b4ebeecd8405cd6bb7898b05bc0f20445
+size 55308538
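Once `git lfs pull` has materialised the pointer above, this file is the harvested dataset that app.py loads at startup. A quick sketch of inspecting it; the expected column list is taken from the fl= parameter of the harvest URL in pages/documentation.py, so treat it as an assumption about the schema rather than a guarantee:

import pandas as pd

df = pd.read_csv("data/hal_articles.csv", sep=",", encoding="utf-8")
print(len(df), "records")
print(df.columns.tolist())
# expected, per the harvest's fl= field list:
# ['uri_s', 'title_s', 'subTitle_s', 'authFullName_s', 'producedDate_s',
#  'domain_t', 'journalTitle_s', 'journalPublisher_s', 'abstract_s']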
embeddings/embeddings_all-MiniLM-L6-v2.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2eed5d5df2835ca6139be11a84c8fc728befaa4804e0311b1f3db8816f7a84c2
+size 34725223
embeddings/embeddings_all-mpnet-base-v2.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02478d4a23588730d3a4cb9a51af539bf2b5223e648d7bd9ba3b7c7a0ea0a51f
+size 69449578
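Each .pt file holds one precomputed embedding per CSV row, which is what lets app.py map hit['corpus_id'] directly to df.iloc. A minimal consistency check, assuming the LFS files have been pulled (384 and 768 are the published output dimensions of all-MiniLM-L6-v2 and all-mpnet-base-v2 respectively):

import pandas as pd
import torch

df = pd.read_csv("data/hal_articles.csv")
for name, dim in [("all-MiniLM-L6-v2", 384), ("all-mpnet-base-v2", 768)]:
    emb = torch.load(f"embeddings/embeddings_{name}.pt", map_location="cpu")
    print(name, tuple(emb.shape))   # expected: (len(df), dim)
    assert emb.shape == (len(df), dim)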
pages/documentation.py
CHANGED
@@ -10,35 +10,60 @@ st.header("Dataset creation")
 
 st.subheader(":blue[HAL API harvest]")
 
+st.write("HAL is the French national open archive for scientific publications, based on the principles of open access and self-archiving.")
 st.write("All the API documentation is available [here](https://api.archives-ouvertes.fr/docs/search)")
-st.write("All records of article type publications reported in the UNIV-COTEDAZUR collection of HAL are obtained with this …
+st.write("All records of article-type publications reported in the UNIV-COTEDAZUR collection of HAL are obtained with this looping function, which populates a pandas DataFrame as output")
 st.code("""
-…
-data = json.loads(response)
-for doc in data["response"]["docs"]:
-    global_list.append(doc)
-if len(data["response"]["docs"]) != 0:
-    return recursive_hal_harvest(cursor=data["nextCursorMark"])
-else:
-    return global_list
-df = pd.DataFrame(recursive_hal_harvest())
+# we retrieve first the total number of records
+url_for_total_count = "https://api.archives-ouvertes.fr/search/UNIV-COTEDAZUR/?q=docType_s:ART&rows=0"
+response = requests.request("GET", url_for_total_count).text
+data = json.loads(response)
+total_count = data["response"]["numFound"]
 
+""", language='python')
+st.code("""
+step = 1000
+df = []
+for i in range(1, int(total_count), int(step)):
+    url = f"https://api.archives-ouvertes.fr/search/UNIV-COTEDAZUR/?q=docType_s:ART&rows={step}&start={i}&wt=csv&fl=uri_s,title_s,subTitle_s,authFullName_s,producedDate_s,domain_t,journalTitle_s,journalPublisher_s,abstract_s"
+    data = pd.read_csv(url, encoding="utf-8")
+    df.append(data)
+df = pd.concat(df)
+# clean up a little bit
+df = df.drop_duplicates(subset=['uri_s'])
+df = df.replace(np.nan, '')
 """, language='python')
 
 st.write("The dataframe's metadata columns are then concatenated into a single combined text in a new column. The different embedding models are then applied to this new column, encoding the combined text into a single vector embedding.")
 st.code("""
 df = df.astype(str)
 df["combined"] = (
-    "Title: " + df.title_s + " …
+    "Title: " + df.title_s + ". " + df.subTitle_s + ". Authors :" + df.authFullName_s + ". Publication date :" + df.producedDate_s + ". Journal title :" + df.journalTitle_s + ". Publisher :" + df.journalPublisher_s + ". Abstract : " + df.abstract_s
+)
+""", language='python')
+
+st.subheader(":blue[Huggingface open models for Embeddings]")
+
+st.write("The open source Huggingface platform hosts a large number of pre-trained models that can be reused for many tasks (text or image classification, summarization, document QA, etc.). We can then use the sentence-transformers library with some of these pre-trained embedding models to create embeddings.")
+st.write("There are two ways of working with the Huggingface-hosted models: using the [inference API endpoint](https://huggingface.co/inference-api) or importing the model locally. Here we choose the second way.")
+st.write("Two open source transformers-based models have been used to convert the textual metadata into numerical vector representations, which generated two vector embeddings datasets: embeddings_all-MiniLM-L6-v2.pt and embeddings_all-mpnet-base-v2.pt")
+st.code("""
+import torch
+from sentence_transformers import SentenceTransformer
+
+embedder = SentenceTransformer('all-MiniLM-L6-v2')  # or 'all-mpnet-base-v2'
+
+corpus_embeddings = embedder.encode(df.combined, convert_to_tensor=True)
+
+# how to save and reload
+torch.save(corpus_embeddings, f"{LOCAL_PATH}/embeddings_all-MiniLM-L6-v2.pt")
+corpus_embeddings = torch.load(f"{LOCAL_PATH}/embeddings_all-MiniLM-L6-v2.pt")
 
 """, language='python')
 
-st.subheader(":blue[OpenAI Embeddings]")
+st.subheader(":blue[Bonus : OpenAI Embeddings]")
+
+st.write("If you want to do the same with text-embedding-ada-002 (the OpenAI embeddings model):")
 
 st.code("""
 import openai
@@ -57,12 +82,28 @@ encoding = tiktoken.get_encoding(embedding_encoding)
 df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
 df = df[df.n_tokens <= max_tokens]
 
-# …
+# generate embeddings
+def custom_get_embedding(text: str) -> list[float]:
+    return openai.Embedding.create(input=[text], model="text-embedding-ada-002")["data"][0]["embedding"]
+df["openai_embedding"] = df.combined.apply(lambda x: custom_get_embedding(x))
+
+""", language='python')
+
+st.write("And the Streamlit UI code would be:")
+
+st.code("""
+df["openai_embedding"] = df.openai_embedding.apply(literal_eval).apply(np.array)
+def custom_get_embedding(text: str) -> list[float]:
+    return openai.Embedding.create(input=[text], model="text-embedding-ada-002", api_key=OPENAI_API_KEY)["data"][0]["embedding"]
+def openai_response(query):
+    query_embedding = np.array(custom_get_embedding(query))
+    df["similarity"] = df.openai_embedding.apply(lambda x: cosine_similarity(x, query_embedding))
+    return df.sort_values("similarity", ascending=False).head(5).to_json(orient="records")
 
 """, language='python')
 
-st.…
+st.header("Dataset hosting")
 
-st.write("The …
+st.write("The CSV file of the dataset is available in the data folder")
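One caveat on the UI snippet above: it calls a cosine_similarity helper that the page never defines. openai==0.27.x shipped one in openai.embeddings_utils, or it can be written as a numpy one-liner; a minimal sketch of the assumed helper:

import numpy as np

def cosine_similarity(a, b):
    # cosine of the angle between two embedding vectors
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

print(cosine_similarity([1.0, 0.0], [1.0, 1.0]))  # ~0.707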
requirements.txt
CHANGED
@@ -12,6 +12,7 @@ colorama==0.4.6
 contourpy==1.1.0
 cycler==0.11.0
 decorator==5.1.1
+dill==0.3.6
 filelock==3.12.2
 fonttools==4.41.0
 frozenlist==1.4.0
@@ -32,10 +33,10 @@ matplotlib==3.7.2
 mdurl==0.1.2
 mpmath==1.3.0
 multidict==6.0.4
+multiprocess==0.70.14
 networkx==3.1
 nltk==3.8.1
 numpy==1.25.1
-openai==0.27.8
 packaging==23.1
 pandas==2.0.3
 Pillow==9.5.0
@@ -62,7 +63,7 @@ sentence-transformers==2.2.2
 sentencepiece==0.1.99
 six==1.16.0
 smmap==5.0.0
-streamlit==1.…
+streamlit==1.25.0
 sympy==1.12
 tenacity==8.2.2
 threadpoolctl==3.2.0
@@ -81,5 +82,6 @@ tzlocal==4.3.1
 urllib3==2.0.3
 validators==0.20.0
 watchdog==3.0.0
+xxhash==3.2.0
 yarl==1.9.2
 zipp==3.16.2