"""Streamlit app: BM25 keyword/document search over a JSON dump of HF daily papers."""
import datetime
import json

from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document
import streamlit as st


def load_docs_from_json(json_path):
    """Load paper records from a JSON file and wrap each one in a ``Document``.

    The file is expected to contain a JSON array of objects, each providing
    the keys ``title``, ``abstract``, ``link``, ``authors``, ``submitter``
    and ``date`` (these are the keys read below).

    Args:
        json_path: Path to the JSON file.

    Returns:
        A list of ``Document`` objects, one per record, whose ``page_content``
        combines the title and abstract and whose ``metadata`` carries the
        remaining fields.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a record is missing one of the required keys.
    """
    # Explicit UTF-8: the platform default encoding is not reliable for
    # JSON that may contain non-ASCII titles/abstracts.
    with open(json_path, encoding="utf-8") as f:
        papers = json.load(f)

    return [
        Document(
            page_content=f"Title: {paper['title']}\n\nAbstract: {paper['abstract']}",
            metadata={
                'title': paper['title'],
                'link': paper['link'],
                'authors': paper['authors'],
                'submitter': paper['submitter'],
                'date': paper['date'],
            },
        )
        for paper in papers
    ]


@st.cache_resource
def _load_corpus(json_path):
    """Load the documents and build the BM25 retriever for ``json_path``.

    Streamlit re-runs this script on every user interaction; caching keeps
    the JSON parsing and index construction from being repeated each time.

    Returns:
        A ``(docs, retriever)`` tuple; the retriever returns the top 10 hits.
    """
    corpus = load_docs_from_json(json_path)
    bm25 = BM25Retriever.from_documents(corpus)
    bm25.k = 10
    return corpus, bm25


# init
json_path = "hf_daily_papers_2023-05-04_2024-06-27.json"
docs, retriever = _load_corpus(json_path)

dates = [
    datetime.datetime.strptime(doc.metadata['date'], '%Y-%m-%d')
    for doc in docs
]
oldest_date = min(dates)
newest_date = max(dates)

# streamlit
st.title("HF Daily Papers Search")
st.markdown("Search papers from [HF daily papers](https://huggingface.co/papers).")
st.markdown(f"Number of documents: `{len(docs)}`.")
st.markdown(f"From `{oldest_date.strftime('%Y-%m-%d')}` to `{newest_date.strftime('%Y-%m-%d')}`.")
st.markdown("This app uses BM25, allowing you to search not only with keywords like \"machine learning\" but also with documents like \"How to generate synthetic data using LLM.\"")

col1, col2 = st.columns([4, 1])

with col1:
    user_query = st.text_input("Search anything...")

with col2:
    # Only record the click here; rendering happens below so the results
    # use the full page width instead of the narrow button column.
    search_clicked = st.button('→')

if search_clicked:
    if not user_query.strip():
        # A blank query has no terms to rank against, so skip retrieval.
        st.warning("Please enter a search query.")
    else:
        results = retriever.invoke(user_query)

        st.text(f"Top n {len(results)} related papers")

        for result in results:
            with st.expander(label=result.metadata['title'], expanded=False):
                for k in result.metadata:
                    st.write(f"{k}: {result.metadata[k]}")
                st.divider()
                st.markdown(result.page_content)