Spaces:

Geraldine
/

HAL-UNIV-COTEDAZUR_semantic_search

Sleeping

App Files Files Community

HAL-UNIV-COTEDAZUR_semantic_search / pages /documentation.py

Geraldine

Upload 3 files

01f708a about 1 year ago

raw

history blame

No virus

3.13 kB

	import streamlit as st

	# Configure the browser tab title and use the full page width.
	# NOTE(review): Streamlit has historically required set_page_config to be
	# the first st.* call on a page -- keep this statement first unless the
	# deployed Streamlit version is confirmed to allow otherwise.
	st.set_page_config(page_title="Documentation", layout="wide")

	# Page title, followed by the "Dataset creation" section and its first
	# subsection. Elements render top-to-bottom in call order.
	st.title("Documentation")

	st.header("Dataset creation")

	# ":blue[...]" is Streamlit's colored-text markdown directive.
	st.subheader(":blue[HAL API harvest]")

	# Markdown link pointing at the HAL search API documentation.
	st.write("All the API documentation is available [here](https://api.archives-ouvertes.fr/docs/search)")
	# Describe the harvest, then display the harvesting snippet.
	# The snippet is only rendered by st.code (this page never executes it), but
	# readers copy it, so the bodies of def / for / if / else are indented to
	# make it valid Python. Tokens and the request URL are unchanged.
	st.write("All records of article type publications reported in the UNIV-COTEDAZUR collection of HAL are obtained with this recursive function that populates a pandas Dataframe as output ")
	st.code("""
	global_list = []
	def recursive_hal_harvest(cursor="*"):
	    url = f"https://api.archives-ouvertes.fr/search/UNIV-COTEDAZUR/?q=docType_s:ART&rows=1000&cursorMark={cursor}&fl=uri_s,title_s,subTitle_s,authFullName_s,producedDate_s,domain_t,journalTitle_s,journalPublisher_s,anrProjectCallTitle_s,abstract_s&sort=docid asc"
	    print(url)
	    response = requests.request("GET", url).text
	    data = json.loads(response)
	    for doc in data["response"]["docs"]:
	        global_list.append(doc)
	    if len(data["response"]["docs"]) != 0:
	        return recursive_hal_harvest(cursor=data["nextCursorMark"])
	    else:
	        return global_list
	df = pd.DataFrame(recursive_hal_harvest())

	""", language='python')

	# Explain how the metadata columns are merged into one "combined" text
	# column (the input later fed to the embedding models), then display the
	# corresponding pandas snippet. The snippet is rendered, not executed.
	st.write("The dataframe's columns of metadata are then concatenated into a single combined text in a new column. It is therefore on this new column that the different embeddings models will be applied to encode this combined text and output a single vector embedding.")
	st.code("""
	df = df.astype(str)
	df["combined"] = (
	"Title: " + df.title_s + ";Subtitle:" + df.subTitle_s + ";Author:" + df.authFullName_s + ";Date:" + df.producedDate_s + ";Journal Title:" + df.journalTitle_s + ";Publisher:" + df.journalPublisher_s + ";ANR Project:" + df.anrProjectCallTitle_s + "; Abstract: " + df.abstract_s
	)

	""", language='python')

	st.subheader(":blue[OpenAI Embeddings]")

	# Displayed (not executed) snippet: drop rows whose combined text exceeds
	# the token budget, then embed the remaining rows with
	# text-embedding-ada-002.
	# The final snippet line must read the "openai_embedding" column created on
	# the line just before it; none of the displayed snippets creates an
	# "embedding" column, so the previous `df.embedding` raised AttributeError.
	# NOTE(review): the snippet still round-trips the vectors through eval();
	# that is only acceptable for trusted data -- consider removing it.
	st.code("""
	import openai
	import tiktoken
	from openai.embeddings_utils import get_embedding

	openai.api_key = os.getenv("OPENAI_API_KEY")

	# embedding model parameters
	embedding_model = "text-embedding-ada-002"
	embedding_encoding = "cl100k_base" # this the encoding for text-embedding-ada-002
	max_tokens = 8000 # the maximum for text-embedding-ada-002 is 8191

	# filtering dataset on text under the max tokens limit
	encoding = tiktoken.get_encoding(embedding_encoding)
	df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
	df = df[df.n_tokens <= max_tokens]

	# generate embeddings
	df["openai_embedding"] = df.combined.apply(lambda x: get_embedding(x, engine=embedding_model) )
	df["openai_embedding"] = df.openai_embedding.astype(str).apply(eval).apply(np.array)

	""", language='python')

	st.subheader(":blue[Huggingface free models for Embeddings]")

	# Introductory paragraph for the sentence-transformers based embeddings.
	st.write("The open source Huggingface platform hosts a large number of pre-trained models that can then be reused for many tasks (text or image classification, summarization, document QA etc...). We can then use the popular sentence-transformers library applied on freely available text embedding models for creating embeddings ")