import datetime
import json

from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document
import streamlit as st


def load_docs_from_json(json_path):
    """Load the HF daily papers dump and convert each entry into a LangChain Document."""
    with open(json_path) as f:
        papers = json.load(f)

    docs = []
    for paper in papers:
        # Use the title and abstract as the searchable text; keep the rest as metadata.
        page_content = f"Title: {paper['title']}\n\nAbstract: {paper['abstract']}"
        doc = Document(
            page_content=page_content,
            metadata={
                'title': paper['title'],
                'link': paper['link'],
                'authors': paper['authors'],
                'submitter': paper['submitter'],
                'date': paper['date'],
            }
        )
        docs.append(doc)
    return docs


# init: load documents and build the BM25 retriever (top-10 results per query)
json_path = "hf_daily_papers_2023-05-04_2024-06-27.json"
docs = load_docs_from_json(json_path)
retriever = BM25Retriever.from_documents(docs)
retriever.k = 10

# Date range covered by the dataset, shown in the page header.
dates = [datetime.datetime.strptime(doc.metadata['date'], '%Y-%m-%d') for doc in docs]
oldest_date = min(dates)
newest_date = max(dates)

# streamlit UI
st.title("HF Daily Papers Search")
st.markdown(
    f"Search papers from [HF daily papers](https://huggingface.co/papers).\n\n"
    f"Number of documents: {len(docs)}\n\n"
    f"from {oldest_date.strftime('%Y-%m-%d')} to {newest_date.strftime('%Y-%m-%d')}"
)

user_query = st.text_input("Search anything...")

if st.button('→'):
    results = retriever.invoke(user_query)
    st.text(f"hit {len(results)} papers")
    for result in results:
        # One collapsible panel per result, showing metadata followed by the indexed text.
        with st.expander(label=result.metadata['title'], expanded=False):
            for k in result.metadata:
                st.write(f"{k}: {result.metadata[k]}")
            st.divider()
            st.markdown(result.page_content)
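To try it locally, save the script (assumed here to be named app.py) next to the JSON dump and start it with `streamlit run app.py`. Note that BM25Retriever depends on the rank_bm25 package, so it needs to be installed alongside langchain-community and streamlit.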