Geraldine committed on
Commit
d9859bd
1 Parent(s): 6d0c868

Delete pages

Browse files
Files changed (1) hide show
  1. pages/documentation.py +0 -107
pages/documentation.py DELETED
@@ -1,107 +0,0 @@
1
- import streamlit as st
2
-
3
- # Set Streamlit page configuration
4
- st.set_page_config(page_title="Documentation", layout="wide")
5
-
6
- # Set up the Streamlit app layout
7
- st.title("Documentation")
8
-
9
- st.header("Dataset creation")
10
-
11
- st.subheader(":blue[HAL API harvest]")
12
-
13
- st.write("HAL is the French national open archive for scientific publications based on the principles of open access and self-archiving.")
14
- st.write("All the API documentation is available [here](https://api.archives-ouvertes.fr/docs/search)")
15
- st.write("All records of article type publications reported in the UNIV-COTEDAZUR collection of HAL are obtained with this looping function that populates a pandas Dataframe as output ")
16
- st.code("""
17
- # we retrieve first the total number of records
18
- url_for_total_count = "https://api.archives-ouvertes.fr/search/UNIV-COTEDAZUR/?q=docType_s:ART&rows=0"
19
- response = requests.request("GET", url_for_total_count).text
20
- data = json.loads(response)
21
- total_count = data["response"]["numFound"]
22
-
23
- """, language='python')
24
- st.code("""
25
- step = 1000
26
- df = []
27
- for i in range(1, int(total_count), int(step)):
28
- url = f"https://api.archives-ouvertes.fr/search/UNIV-COTEDAZUR/?q=docType_s:ART&rows={step}&start={i}&wt=csv&fl=uri_s,title_s,subTitle_s,authFullName_s,producedDate_s,domain_t,journalTitle_s,journalPublisher_s,abstract_s"
29
- data = pd.read_csv(url, encoding="utf-8")
30
- df.append(data)
31
- df = pd.concat(df)
32
- # clean up a little bit
33
- df = df.drop_duplicates(subset=['uri_s'])
34
- df = df.replace(np.nan, '')
35
- """, language='python')
36
-
37
- st.write("The dataframe's metadata columns are then concatenated into a single combined text stored in a new column. The different embedding models are then applied to this new column to encode the combined text and output a single vector embedding.")
38
- st.code("""
39
- df = df.astype(str)
40
- df["combined"] = df.title_s + ". " + df.subTitle_s + ". " +df.abstract_s
41
- """, language='python')
42
-
43
- st.subheader(":blue[Huggingface open models for Embeddings]")
44
-
45
- st.write("The open source Huggingface platform hosts a large number of pre-trained models that can then be reused for many tasks (text or image classification, summarization, document QA etc...). We can then use the sentence-transformers library applied on some of these available embedding pre-trained models for creating embeddings.")
46
- st.write("There are two ways of working with the Huggingface-hosted models: by using the [inference API endpoint](https://huggingface.co/inference-api) or by importing the model locally. Here we choose the second way.")
47
- st.write("Two open source transformers-based models have been used to convert the textual metadata into numerical vector representation, which generated two vector embeddings datasets : embeddings_all-MiniLM-L6-v2.pt and embeddings_multi-qa-mpnet-base-dot-v1.pt")
48
- st.code("""
49
- import torch
50
- from sentence_transformers import SentenceTransformer
51
-
52
- embedder = SentenceTransformer('all-MiniLM-L6-v2') # or 'multi-qa-mpnet-base-dot-v1'
53
-
54
- corpus_embeddings = embedder.encode(df.combined, convert_to_tensor=True)
55
-
56
- # how to save and reload
57
- torch.save(corpus_embeddings, f"{LOCAL_PATH}/embeddings_all-MiniLM-L6-v2.pt")
58
- corpus_embeddings = torch.load(f"{LOCAL_PATH}/embeddings_all-MiniLM-L6-v2.pt")
59
-
60
- """, language='python')
61
-
62
- st.subheader(":blue[Bonus : OpenAI Embeddings]")
63
-
64
- st.write("If you want to do the same with text-embedding-ada-002 (the OpenAI embeddings model)")
65
-
66
- st.code("""
67
- import openai
68
- import tiktoken
69
- from openai.embeddings_utils import get_embedding
70
-
71
- openai.api_key = os.getenv("OPENAI_API_KEY")
72
-
73
- # embedding model parameters
74
- embedding_model = "text-embedding-ada-002"
75
- embedding_encoding = "cl100k_base" # this is the encoding for text-embedding-ada-002
76
- max_tokens = 8000 # the maximum for text-embedding-ada-002 is 8191
77
-
78
- # filtering dataset on text under the max tokens limit
79
- encoding = tiktoken.get_encoding(embedding_encoding)
80
- df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
81
- df = df[df.n_tokens <= max_tokens]
82
-
83
- # generate embeddings
84
- def custom_get_embedding(text: str) -> list[float]:
85
- return openai.Embedding.create(input=[text], model="text-embedding-ada-002")["data"][0]["embedding"]
86
- df["openai_embedding"] = df.combined.apply(lambda x: custom_get_embedding(x) )
87
-
88
- """, language='python')
89
-
90
- st.write("And the Streamlit UI code would be :")
91
-
92
- st.code("""
93
- df["openai_embedding"] = df.openai_embedding.apply(literal_eval).apply(np.array)
94
- def custom_get_embedding(text: str) -> list[float]:
95
- return openai.Embedding.create(input=[text], model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY)["data"][0]["embedding"]
96
- def openai_response(query):
97
- query_embedding = np.array(custom_get_embedding(
98
- query
99
- ))
100
- df["similarity"] = df.openai_embedding.apply(lambda x: cosine_similarity(x, query_embedding))
101
- return df.sort_values("similarity", ascending=False).head(5).to_json(orient="records")
102
-
103
- """, language='python')
104
-
105
- st.header("Dataset hosting")
106
-
107
- st.write("The csv file of the dataset is available in the data folder")