Upload 3 files
Browse files- app.py +3 -3
- pages/documentation.py +68 -0
- requirements.txt +1 -0
app.py
CHANGED
@@ -6,13 +6,13 @@ import torch
|
|
6 |
from sentence_transformers import SentenceTransformer, util
|
7 |
|
8 |
# Set Streamlit page configuration
|
9 |
-
st.set_page_config(page_title="
|
10 |
|
11 |
-
st.title("
|
12 |
|
13 |
model_options = st.sidebar.selectbox("Choose the free embeddings model to use ?", ('all-MiniLM-L6-v2', 'multi-qa-mpnet-base-dot-v1'))
|
14 |
|
15 |
-
if ok_openai := st.checkbox('You want to use the OpenAI Embeddings'):
|
16 |
if openai_api_key := st.text_input(
|
17 |
":blue[Put Your OPENAI API-KEY :]",
|
18 |
placeholder="Paste your OpenAI API key here ",
|
|
|
6 |
from sentence_transformers import SentenceTransformer, util
|
7 |
|
8 |
# Set Streamlit page configuration
|
9 |
+
st.set_page_config(page_title="App", layout="wide")
|
10 |
|
11 |
+
st.title("HAL UNIV-COTEDAZUR Collection Semantic Search")
|
12 |
|
13 |
model_options = st.sidebar.selectbox("Choose the free embeddings model to use ?", ('all-MiniLM-L6-v2', 'multi-qa-mpnet-base-dot-v1'))
|
14 |
|
15 |
+
if ok_openai := st.sidebar.checkbox('You want to use the OpenAI Embeddings too'):
|
16 |
if openai_api_key := st.text_input(
|
17 |
":blue[Put Your OPENAI API-KEY :]",
|
18 |
placeholder="Paste your OpenAI API key here ",
|
pages/documentation.py
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
# Set Streamlit page configuration
|
4 |
+
st.set_page_config(page_title="Documentation", layout="wide")
|
5 |
+
|
6 |
+
# Set up the Streamlit app layout
|
7 |
+
st.title("Documentation")
|
8 |
+
|
9 |
+
st.header("Dataset creation")
|
10 |
+
|
11 |
+
st.subheader(":blue[HAL API harvest]")
|
12 |
+
|
13 |
+
st.write("All the API documentation is available [here](https://api.archives-ouvertes.fr/docs/search)")
|
14 |
+
st.write("All article-type publication records in the UNIV-COTEDAZUR collection of HAL are retrieved with this recursive function, which populates a pandas DataFrame as output ")
|
15 |
+
st.code("""
|
16 |
+
global_list = []
|
17 |
+
def recursive_hal_harvest(cursor="*"):
|
18 |
+
url = f"https://api.archives-ouvertes.fr/search/UNIV-COTEDAZUR/?q=docType_s:ART&rows=1000&cursorMark={cursor}&fl=uri_s,title_s,subTitle_s,authFullName_s,producedDate_s,domain_t,journalTitle_s,journalPublisher_s,anrProjectCallTitle_s,abstract_s&sort=docid asc"
|
19 |
+
print(url)
|
20 |
+
response = requests.request("GET", url).text
|
21 |
+
data = json.loads(response)
|
22 |
+
for doc in data["response"]["docs"]:
|
23 |
+
global_list.append(doc)
|
24 |
+
if len(data["response"]["docs"]) != 0:
|
25 |
+
return recursive_hal_harvest(cursor=data["nextCursorMark"])
|
26 |
+
else:
|
27 |
+
return global_list
|
28 |
+
df = pd.DataFrame(recursive_hal_harvest())
|
29 |
+
|
30 |
+
""", language='python')
|
31 |
+
|
32 |
+
st.write("The dataframe's metadata columns are then concatenated into a single combined text in a new column. The different embedding models are then applied to this new column to encode the combined text and output a single vector embedding.")
|
33 |
+
st.code("""
|
34 |
+
df = df.astype(str)
|
35 |
+
df["combined"] = (
|
36 |
+
"Title: " + df.title_s + ";Subtitle:" + df.subTitle_s + ";Author:" + df.authFullName_s + ";Date:" + df.producedDate_s + ";Journal Title:" + df.journalTitle_s + ";Publisher:" + df.journalPublisher_s + ";ANR Project:" + df.anrProjectCallTitle_s + "; Abstract: " + df.abstract_s
|
37 |
+
)
|
38 |
+
|
39 |
+
""", language='python')
|
40 |
+
|
41 |
+
st.subheader(":blue[OpenAI Embeddings]")
|
42 |
+
|
43 |
+
st.code("""
|
44 |
+
import openai
|
45 |
+
import tiktoken
|
46 |
+
from openai.embeddings_utils import get_embedding
|
47 |
+
|
48 |
+
openai.api_key = os.getenv("OPENAI_API_KEY")
|
49 |
+
|
50 |
+
# embedding model parameters
|
51 |
+
embedding_model = "text-embedding-ada-002"
|
52 |
+
embedding_encoding = "cl100k_base" # this is the encoding for text-embedding-ada-002
|
53 |
+
max_tokens = 8000 # the maximum for text-embedding-ada-002 is 8191
|
54 |
+
|
55 |
+
# filtering dataset on text under the max tokens limit
|
56 |
+
encoding = tiktoken.get_encoding(embedding_encoding)
|
57 |
+
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
|
58 |
+
df = df[df.n_tokens <= max_tokens]
|
59 |
+
|
60 |
+
# generate embeddings
|
61 |
+
df["openai_embedding"] = df.combined.apply(lambda x: get_embedding(x, engine=embedding_model) )
|
62 |
+
df["openai_embedding"] = df.openai_embedding.astype(str).apply(eval).apply(np.array)
|
63 |
+
|
64 |
+
""", language='python')
|
65 |
+
|
66 |
+
st.subheader(":blue[Huggingface free models for Embeddings]")
|
67 |
+
|
68 |
+
st.write("The open source Huggingface platform hosts a large number of pre-trained models that can be reused for many tasks (text or image classification, summarization, document QA etc...). We can then use the popular sentence-transformers library with freely available text embedding models to create embeddings ")
|
requirements.txt
CHANGED
@@ -39,6 +39,7 @@ openai==0.27.8
|
|
39 |
packaging==23.1
|
40 |
pandas==2.0.3
|
41 |
Pillow==9.5.0
|
|
|
42 |
protobuf==4.23.4
|
43 |
pyarrow==12.0.1
|
44 |
pydeck==0.8.1b0
|
|
|
39 |
packaging==23.1
|
40 |
pandas==2.0.3
|
41 |
Pillow==9.5.0
|
42 |
+
plotly==5.15.0
|
43 |
protobuf==4.23.4
|
44 |
pyarrow==12.0.1
|
45 |
pydeck==0.8.1b0
|