Spaces:
Sleeping
Sleeping
import datetime | |
import json | |
from langchain_community.retrievers import BM25Retriever | |
from langchain_core.documents import Document | |
import streamlit as st | |
def load_docs_from_json(json_path):
    """Load paper records from a JSON file and wrap each one in a ``Document``.

    The searchable text of every document is the paper's title followed by
    its abstract; the remaining fields are carried along as metadata.

    Args:
        json_path: Path to a JSON file whose top-level value is iterated as a
            sequence of paper records. Each record is indexed with the keys
            ``title``, ``abstract``, ``link``, ``authors``, ``submitter`` and
            ``date``.

    Returns:
        A list of ``Document`` objects, one per record, in file order.

    Raises:
        KeyError: If a record (assumed to be a dict) lacks one of the keys
            listed above.
    """
    # Decode explicitly as UTF-8: without ``encoding`` the platform's locale
    # encoding is used, which can fail or garble non-ASCII titles/abstracts.
    with open(json_path, encoding="utf-8") as f:
        papers = json.load(f)

    docs = []
    for paper in papers:
        # Title and abstract together form the text that gets indexed.
        page_content = f"Title: {paper['title']}\n\nAbstract: {paper['abstract']}"
        doc = Document(
            page_content=page_content,
            metadata={
                'title': paper['title'],
                'link': paper['link'],
                'authors': paper['authors'],
                'submitter': paper['submitter'],
                'date': paper['date'],
            },
        )
        docs.append(doc)
    return docs
# init
json_path = "hf_daily_papers_2023-05-04_2024-06-27.json"


@st.cache_resource
def _build_index(path):
    """Load the papers at *path* and build a BM25 retriever over them.

    Streamlit re-executes this script on every widget interaction; caching
    the result lets reruns reuse the parsed documents and the index instead
    of rebuilding them each time.

    Returns:
        A ``(docs, retriever)`` tuple.
    """
    loaded_docs = load_docs_from_json(path)
    bm25 = BM25Retriever.from_documents(loaded_docs)
    bm25.k = 10  # number of documents returned per query
    return loaded_docs, bm25


docs, retriever = _build_index(json_path)

# Date range covered by the dataset, shown in the page header below.
dates = [datetime.datetime.strptime(doc.metadata['date'], '%Y-%m-%d') for doc in docs]
oldest_date = min(dates)
newest_date = max(dates)

# streamlit
st.title("HF Daily Papers Search")
st.markdown("Search papers from [HF daily papers](https://huggingface.co/papers).")
st.markdown(f"Number of documents: `{len(docs)}`.")
st.markdown(f"From `{oldest_date.strftime('%Y-%m-%d')}` to `{newest_date.strftime('%Y-%m-%d')}`.")
st.markdown("This app uses BM25, allowing you to search not only with keywords like \"machine learning\" but also with documents like \"How to generate synthetic data using LLM.\"")

# Wide column for the query box, narrow column for the search button.
col1, col2 = st.columns([4, 1])
with col1:
    user_query = st.text_input("Search anything...")
with col2:
    if st.button('β'):
        if not user_query.strip():
            # An empty query has no terms to score, so any "top" results
            # would be arbitrary; ask for input instead of showing them.
            st.warning("Please enter a search query.")
        else:
            # NOTE(review): results are rendered inside this narrow column;
            # consider rendering them below the columns at full width.
            results = retriever.invoke(user_query)
            st.text(f"Top n {len(results)} related papers")
            for result in results:
                with st.expander(label=result.metadata['title'], expanded=False):
                    # Metadata fields first, then the indexed text.
                    for k in result.metadata:
                        st.write(f"{k}: {result.metadata[k]}")
                    st.divider()
                    st.markdown(result.page_content)