Geraldine committed on
Commit
01f708a
1 Parent(s): 018a0a9

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +3 -3
  2. pages/documentation.py +68 -0
  3. requirements.txt +1 -0
app.py CHANGED
@@ -6,13 +6,13 @@ import torch
6
  from sentence_transformers import SentenceTransformer, util
7
 
8
  # Set Streamlit page configuration
9
- st.set_page_config(page_title="home", layout="wide")
10
 
11
- st.title("Streamlit App around GPT")
12
 
13
  model_options = st.sidebar.selectbox("Choose the free embeddings model to use ?", ('all-MiniLM-L6-v2', 'multi-qa-mpnet-base-dot-v1'))
14
 
15
- if ok_openai := st.checkbox('You want to use the OpenAI Embeddings'):
16
  if openai_api_key := st.text_input(
17
  ":blue[Put Your OPENAI API-KEY :]",
18
  placeholder="Paste your OpenAI API key here ",
 
6
  from sentence_transformers import SentenceTransformer, util
7
 
8
  # Set Streamlit page configuration
9
+ st.set_page_config(page_title="App", layout="wide")
10
 
11
+ st.title("HAL UNIV-COTEDAZUR Collection Semantic Search")
12
 
13
  model_options = st.sidebar.selectbox("Choose the free embeddings model to use ?", ('all-MiniLM-L6-v2', 'multi-qa-mpnet-base-dot-v1'))
14
 
15
+ if ok_openai := st.sidebar.checkbox('You want to use the OpenAI Embeddings too'):
16
  if openai_api_key := st.text_input(
17
  ":blue[Put Your OPENAI API-KEY :]",
18
  placeholder="Paste your OpenAI API key here ",
pages/documentation.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ # Set Streamlit page configuration
4
+ st.set_page_config(page_title="Documentation", layout="wide")
5
+
6
+ # Set up the Streamlit app layout
7
+ st.title("Documentation")
8
+
9
+ st.header("Dataset creation")
10
+
11
+ st.subheader(":blue[HAL API harvest]")
12
+
13
+ st.write("All the API documentation is available [here](https://api.archives-ouvertes.fr/docs/search)")
14
+ st.write("All records of article type publications reported in the UNIV-COTEDAZUR collection of HAL are obtained with this recursive function that populates a pandas Dataframe as output ")
15
+ st.code("""
16
+ global_list = []
17
+ def recursive_hal_harvest(cursor="*"):
18
+ url = f"https://api.archives-ouvertes.fr/search/UNIV-COTEDAZUR/?q=docType_s:ART&rows=1000&cursorMark={cursor}&fl=uri_s,title_s,subTitle_s,authFullName_s,producedDate_s,domain_t,journalTitle_s,journalPublisher_s,anrProjectCallTitle_s,abstract_s&sort=docid asc"
19
+ print(url)
20
+ response = requests.request("GET", url).text
21
+ data = json.loads(response)
22
+ for doc in data["response"]["docs"]:
23
+ global_list.append(doc)
24
+ if len(data["response"]["docs"]) != 0:
25
+ return recursive_hal_harvest(cursor=data["nextCursorMark"])
26
+ else:
27
+ return global_list
28
+ df = pd.DataFrame(recursive_hal_harvest())
29
+
30
+ """, language='python')
31
+
32
+ st.write("The dataframe's columns of metadata are then concatenated into a single combined text in a new column. It is therefore on this new column that the different embeddings models will be applied to encode this combined text and output a single vector embedding.")
33
+ st.code("""
34
+ df = df.astype(str)
35
+ df["combined"] = (
36
+ "Title: " + df.title_s + ";Subtitle:" + df.subTitle_s + ";Author:" + df.authFullName_s + ";Date:" + df.producedDate_s + ";Journal Title:" + df.journalTitle_s + ";Publisher:" + df.journalPublisher_s + ";ANR Project:" + df.anrProjectCallTitle_s + "; Abstract: " + df.abstract_s
37
+ )
38
+
39
+ """, language='python')
40
+
41
+ st.subheader(":blue[OpenAI Embeddings]")
42
+
43
+ st.code("""
44
+ import openai
45
+ import tiktoken
46
+ from openai.embeddings_utils import get_embedding
47
+
48
+ openai.api_key = os.getenv("OPENAI_API_KEY")
49
+
50
+ # embedding model parameters
51
+ embedding_model = "text-embedding-ada-002"
52
+ embedding_encoding = "cl100k_base" # this is the encoding for text-embedding-ada-002
53
+ max_tokens = 8000 # the maximum for text-embedding-ada-002 is 8191
54
+
55
+ # filtering dataset on text under the max tokens limit
56
+ encoding = tiktoken.get_encoding(embedding_encoding)
57
+ df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
58
+ df = df[df.n_tokens <= max_tokens]
59
+
60
+ # generate embeddings
61
+ df["openai_embedding"] = df.combined.apply(lambda x: get_embedding(x, engine=embedding_model) )
62
+ df["openai_embedding"] = df.openai_embedding.astype(str).apply(eval).apply(np.array)
63
+
64
+ """, language='python')
65
+
66
+ st.subheader(":blue[Huggingface free models for Embeddings]")
67
+
68
+ st.write("The open source Huggingface platform hosts a large number of pre-trained models that can then be reused for many tasks (text or image classification, summarization, document QA etc...). We can then use the popular sentence-transformers library applied on free available text embedding models for creating embeddings ")
requirements.txt CHANGED
@@ -39,6 +39,7 @@ openai==0.27.8
39
  packaging==23.1
40
  pandas==2.0.3
41
  Pillow==9.5.0
 
42
  protobuf==4.23.4
43
  pyarrow==12.0.1
44
  pydeck==0.8.1b0
 
39
  packaging==23.1
40
  pandas==2.0.3
41
  Pillow==9.5.0
42
+ plotly==5.15.0
43
  protobuf==4.23.4
44
  pyarrow==12.0.1
45
  pydeck==0.8.1b0