"""Streamlit page that documents how the embeddings dataset was built."""

import streamlit as st


# Page-level settings: browser tab title and full-width layout.
st.set_page_config(page_title="Documentation", layout="wide")


st.title("Documentation")

# --- Section: dataset creation / HAL API harvest -----------------------------
# Everything passed to st.code below is displayed to the reader, not executed.
st.header("Dataset creation")

st.subheader(":blue[HAL API harvest]")

st.write("HAL is the French national open archive for scientific publications based on the principles of open access and self-archiving.")
st.write("All the API documentation is available [here](https://api.archives-ouvertes.fr/docs/search)")
st.write("All records of article type publications reported in the UNIV-COTEDAZUR collection of HAL are obtained with this looping function that populates a pandas Dataframe as output ")
st.code("""
# we retrieve first the total number of records
url_for_total_count = "https://api.archives-ouvertes.fr/search/UNIV-COTEDAZUR/?q=docType_s:ART&rows=0"
response = requests.request("GET", url_for_total_count).text
data = json.loads(response)
total_count = data["response"]["numFound"]

""", language='python')
# The displayed paging loop starts at offset 0: the API's `start` parameter is
# a zero-based offset, so starting at 1 would skip the first record.
st.code("""
step = 1000
df = []
for i in range(0, int(total_count), int(step)):
    url = f"https://api.archives-ouvertes.fr/search/UNIV-COTEDAZUR/?q=docType_s:ART&rows={step}&start={i}&wt=csv&fl=uri_s,title_s,subTitle_s,authFullName_s,producedDate_s,domain_t,journalTitle_s,journalPublisher_s,abstract_s"
    data = pd.read_csv(url, encoding="utf-8")
    df.append(data)
df = pd.concat(df)
# clean up a little bit
df = df.drop_duplicates(subset=['uri_s'])
df = df.replace(np.nan, '')
""", language='python')

st.write("The dataframe's columns of metadata are then concatenated into a single combined text in a new column. It is therefore on this new column that the different embeddings models will be applied to encode this combined text and output a single vector embedding.")
st.code("""
df = df.astype(str)
df["combined"] = df.title_s + ". " + df.subTitle_s + ". " +df.abstract_s
""", language='python')

# --- Section: embeddings with open Huggingface models ------------------------
# The snippet passed to st.code is displayed to the reader, not executed.
st.subheader(":blue[Huggingface open models for Embeddings]")

st.write("The open source Huggingface platform hosts a large number of pre-trained models that can then be reused for many tasks (text or image classification, summarization, document QA etc...). We can then use the sentence-transformers library applied on some of these available embedding pre-trained models for creating embeddings.")
st.write("There are two ways of working with the Huggingface hosted models : by using the [inference API endpoint](https://huggingface.co/inference-api) or by locally importing the model. Here we choose the second way")
st.write("Two open source transformers-based models have been used to convert the textual metadata into numerical vector representation, which generated two vector embeddings datasets : embeddings_all-MiniLM-L6-v2.pt and embeddings_multi-qa-mpnet-base-dot-v1.pt")
# The displayed call hands encode() a plain list of strings (the input type its
# API documents) rather than a pandas Series, whose index labels are not
# guaranteed to be unique positions after the concatenation shown earlier.
st.code("""
import torch
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer('all-MiniLM-L6-v2') # or 'multi-qa-mpnet-base-dot-v1'

corpus_embeddings = embedder.encode(df.combined.tolist(), convert_to_tensor=True)

# how to save and reload
torch.save(corpus_embeddings, f"{LOCAL_PATH}/embeddings_all-MiniLM-L6-v2.pt")
corpus_embeddings = torch.load(f"{LOCAL_PATH}/embeddings_all-MiniLM-L6-v2.pt")

""", language='python')

# --- Section: bonus, the same pipeline with OpenAI embeddings ----------------
# The snippet passed to st.code is displayed to the reader, not executed.
st.subheader(":blue[Bonus : OpenAI Embeddings]")

st.write("If you want to do the same with text-embedding-ada-002 (the OpenAI embeddings model)")

# `import os` is listed in the displayed snippet because it calls os.getenv.
st.code("""
import os

import openai
import tiktoken
from openai.embeddings_utils import get_embedding

openai.api_key = os.getenv("OPENAI_API_KEY")

# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base" # this the encoding for text-embedding-ada-002
max_tokens = 8000 # the maximum for text-embedding-ada-002 is 8191

# filtering dataset on text under the max tokens limit
encoding = tiktoken.get_encoding(embedding_encoding)
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens]

# generate embeddings
def custom_get_embedding(text: str) -> list[float]:
    return openai.Embedding.create(input=[text], model="text-embedding-ada-002")["data"][0]["embedding"]
df["openai_embedding"] = df.combined.apply(lambda x: custom_get_embedding(x) )

""", language='python')

# --- Section: Streamlit UI code for querying the OpenAI embeddings -----------
# The snippet passed to st.code is displayed to the reader, not executed.
st.write("And the Streamlit UI code would be :")

# The displayed request passes the key as `api_key=`, the per-request keyword
# of the legacy openai.Embedding.create call; any other keyword is forwarded
# to the API as a request parameter.
st.code("""
df["openai_embedding"] = df.openai_embedding.apply(literal_eval).apply(np.array)
def custom_get_embedding(text: str) -> list[float]:
    return openai.Embedding.create(input=[text], model="text-embedding-ada-002", api_key=OPENAI_API_KEY)["data"][0]["embedding"]
def openai_response(query):
    query_embedding = np.array(custom_get_embedding(
        query
    ))
    df["similarity"] = df.openai_embedding.apply(lambda x: cosine_similarity(x, query_embedding))
    return df.sort_values("similarity", ascending=False).head(5).to_json(orient="records")

""", language='python')

# --- Section: where the dataset file lives -----------------------------------
st.header("Dataset hosting")

st.write("The csv file of the dataset is available in the data folder")