Spaces:
Running
Running
File size: 1,993 Bytes
b9f0901 309c9bf b9f0901 309c9bf b9f0901 309c9bf b9f0901 309c9bf 771786d 2f7da08 3e2bddf 2f7da08 bec3a32 2f7da08 bec3a32 87f166c 870bb58 3e2bddf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
import datetime
import json
from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document
import streamlit as st
def load_docs_from_json(json_path):
    """Load paper records from a JSON file and wrap each one in a ``Document``.

    Args:
        json_path: Path to a JSON file that is iterated as a sequence of
            paper records. Every record is indexed by the keys ``title``,
            ``abstract``, ``link``, ``authors``, ``submitter`` and ``date``.

    Returns:
        A list with one ``Document`` per record, in file order. The page
        content is ``"Title: <title>\\n\\nAbstract: <abstract>"`` and the
        metadata carries ``title``, ``link``, ``authors``, ``submitter`` and
        ``date`` (in that order).

    Raises:
        KeyError: If a record is missing one of the keys listed above.
    """
    # Metadata fields copied verbatim from each record; the order here is the
    # order in which the keys end up in ``Document.metadata``.
    metadata_keys = ('title', 'link', 'authors', 'submitter', 'date')

    # Be explicit about the encoding: without it, open() falls back to the
    # platform locale, which can fail on (or mis-decode) non-ASCII titles and
    # abstracts on systems whose default is not UTF-8.
    with open(json_path, encoding="utf-8") as f:
        papers = json.load(f)

    return [
        Document(
            page_content=f"Title: {paper['title']}\n\nAbstract: {paper['abstract']}",
            metadata={key: paper[key] for key in metadata_keys},
        )
        for paper in papers
    ]
# init
json_path = "hf_daily_papers_2023-05-04_2024-06-27.json"


@st.cache_resource
def _build_index(path):
    """Load the documents at *path* and build a BM25 retriever over them.

    Streamlit re-executes the whole script on every user interaction, so
    without caching the JSON file would be re-parsed and the BM25 index
    rebuilt on every search. ``st.cache_resource`` keeps one shared result
    per distinct *path*.

    Returns:
        A ``(docs, retriever)`` tuple; the retriever returns the top 10 hits.
    """
    loaded_docs = load_docs_from_json(path)
    bm25 = BM25Retriever.from_documents(loaded_docs)
    bm25.k = 10
    return loaded_docs, bm25


# NOTE: the cached objects are shared between reruns and are not copied, so
# the rest of the script must treat them as read-only.
docs, retriever = _build_index(json_path)

# Date range covered by the corpus; each metadata 'date' is parsed as YYYY-MM-DD.
dates = [datetime.datetime.strptime(doc.metadata['date'], '%Y-%m-%d') for doc in docs]
oldest_date = min(dates)
newest_date = max(dates)
# streamlit
# Page header: title, corpus size and the date range covered by the corpus.
st.title("HF Daily Papers Search")
st.markdown("Search papers from [HF daily papers](https://huggingface.co/papers).")
st.markdown(f"Number of documents: `{len(docs)}`.")
st.markdown(f"From `{oldest_date.strftime('%Y-%m-%d')}` to `{newest_date.strftime('%Y-%m-%d')}`.")
st.markdown("This app uses [BM25](https://en.wikipedia.org/wiki/Okapi_BM25), allowing you to search not only with keywords like \"machine learning\" \n but also with documents like \"How to generate synthetic data using LLM.\"")

# chat_input yields a falsy value until the user submits a query, so nothing
# below runs on the initial page load.
prompt = st.chat_input("Search anything...")
if prompt:
    results = retriever.invoke(prompt)

    # Rendered as markdown (not st.text) so the backticks show as inline code,
    # consistent with the header lines above.
    st.markdown(f"Top n `{len(results)}` related papers")

    for result in results:
        # One collapsed expander per hit: metadata fields first, then the
        # document text (title + abstract) below a divider.
        with st.expander(label=result.metadata['title'], expanded=False):
            for k in result.metadata:
                st.write(f"{k}: {result.metadata[k]}")
            st.divider()
            st.markdown(result.page_content)