Spaces:

Geraldine
/

HAL-UNIV-COTEDAZUR_semantic_search

Sleeping

App Files Files Community

HAL-UNIV-COTEDAZUR_semantic_search / app.py

Geraldine

Update app.py

ce243f1 about 1 year ago

raw

history blame

No virus

2.49 kB

	from urllib.parse import quote_plus

	import numpy as np
	import pandas as pd
	import streamlit as st
	import streamlit.components.v1 as components
	import torch
	from sentence_transformers import SentenceTransformer, util

	# Page setup: browser-tab title, wide layout, and the visible heading.
	st.set_page_config(layout="wide", page_title="App")
	st.title("Semantic Search on HAL UNIV-COTEDAZUR Collection (articles)")

	# Read the article metadata while a spinner is displayed, then normalise
	# every cell to text: NaN becomes '' and all columns are cast to str.
	with st.spinner('Loading dataset...'):
	    df = (
	        pd.read_csv("data/hal_articles.csv", sep=",", encoding="utf-8")
	        .replace(np.nan, '')
	        .astype(str)
	    )

	def llm_response(query: str, model_option: str) -> list:
	    """Return the five articles whose embeddings best match *query*.

	    The query is encoded with the SentenceTransformer model named by
	    ``model_option`` and compared against the corpus embeddings stored in
	    ``embeddings/embeddings_{model_option}.pt``. Each hit's ``corpus_id``
	    is used as a positional row index into the module-level ``df``.

	    Args:
	        query: Free-text question typed by the user.
	        model_option: Name of the embedding model; it also selects which
	            embeddings file is loaded.

	    Returns:
	        A list of dicts, in the order the hits are returned, with the keys
	        ``title``, ``date``, ``journal``, ``pub`` and ``abstract``.
	    """
	    embedder = SentenceTransformer(model_option)
	    question_embedding = embedder.encode(query, convert_to_tensor=True)

	    # NOTE(review): torch.load unpickles the file it reads, so only
	    # embedding files produced by this project should ever be placed in
	    # the embeddings/ directory.
	    corpus_embeddings = torch.load(
	        f"embeddings/embeddings_{model_option}.pt",
	        map_location=torch.device('cpu'),
	    )
	    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=5)

	    article_data_list = []
	    # Only one query is submitted, so only the first result list is read.
	    for hit in hits[0]:
	        article_data = df.iloc[hit['corpus_id']]

	        # df holds '' for missing cells; skip the ". " separator when there
	        # is no subtitle so the title does not end with a dangling ". ".
	        title = article_data["title_s"]
	        subtitle = article_data["subTitle_s"]
	        if subtitle:
	            title = f"{title}. {subtitle}"

	        article_data_list.append({
	            "title": title,
	            "date": article_data["producedDate_s"],
	            "journal": article_data["journalTitle_s"],
	            "pub": article_data["journalPublisher_s"],
	            "abstract": article_data["abstract_s"],
	        })
	    return article_data_list

	# Embedding models offered in the sidebar. llm_response() loads the
	# pre-computed corpus embeddings from a file named after the chosen model.
	models = ['all-MiniLM-L6-v2', 'all-mpnet-base-v2']
	model_option = st.sidebar.selectbox("Choose the open embeddings model to use ?", models)

	with st.container():
	    # Nothing below runs until the user has typed a non-empty question.
	    if query := st.text_input("Enter your question :"):
	        st.markdown(f"### :green[{model_option} results]")

	        # Show the HAL portal's own search results for comparison.
	        with st.expander(":blue[click here to see the HAL search engine results]"):
	            # URL-encode the user's text so characters such as '&', '#',
	            # '+' or spaces cannot truncate or alter the query string.
	            hal_url = (
	                "https://hal.univ-cotedazur.fr/search/index/"
	                f"?q={quote_plus(query)}&rows=30&docType_s=ART"
	            )
	            components.iframe(hal_url, height=800, scrolling=True)

	        with st.spinner('Calculating...'):
	            response = llm_response(query, model_option)

	        # One success box per retrieved article.
	        for x in response:
	            st.success(
	                f"Title : {x['title']} \n "
	                f"Date : {x['date']} \n "
	                f"Journal : {x['journal']}({x['pub']}) \n "
	                f"Abstract : {x['abstract']}"
	            )