import datetime
import json

from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document
import streamlit as st


def load_docs_from_json(json_path):
    """Load papers from a JSON file and wrap each one as a LangChain Document."""
    with open(json_path) as f:
        papers = json.load(f)
    docs = []
    for paper in papers:
        # Index the title and abstract; keep the rest as metadata for display.
        page_content = f"Title: {paper['title']}\n\nAbstract: {paper['abstract']}"
        doc = Document(
            page_content=page_content,
            metadata={
                'title': paper['title'],
                'link': paper['link'],
                'authors': paper['authors'],
                'submitter': paper['submitter'],
                'date': paper['date'],
            },
        )
        docs.append(doc)
    return docs


# init: load documents and build the BM25 retriever
json_path = "hf_daily_papers_2023-05-04_2024-06-27.json"
docs = load_docs_from_json(json_path)
retriever = BM25Retriever.from_documents(docs)
retriever.k = 10  # return the top 10 matches per query

# date range of the indexed papers
dates = [datetime.datetime.strptime(doc.metadata['date'], '%Y-%m-%d') for doc in docs]
oldest_date = min(dates)
newest_date = max(dates)

# streamlit UI
st.title("HF Daily Papers Search")
st.markdown("Search papers from [HF daily papers](https://huggingface.co/papers).")
st.markdown(f"Number of documents: `{len(docs)}`.")
st.markdown(f"From `{oldest_date.strftime('%Y-%m-%d')}` to `{newest_date.strftime('%Y-%m-%d')}`.")
st.markdown(
    "This app uses [BM25](https://en.wikipedia.org/wiki/Okapi_BM25), allowing you to search "
    "not only with keywords like \"machine learning\" but also with documents like "
    "\"How to generate synthetic data using LLM.\""
)

prompt = st.chat_input("Search anything...")
if prompt:
    results = retriever.invoke(prompt)
    st.markdown(f"Top `{len(results)}` related papers")

    for result in results:
        with st.expander(label=result.metadata['title'], expanded=False):
            for k in result.metadata:
                st.write(f"{k}: {result.metadata[k]}")
            st.divider()
            st.markdown(result.page_content)
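
# Usage sketch (the file name app.py is an assumption; the JSON path is the one
# hard-coded above). BM25Retriever needs the rank_bm25 package at runtime.
#
#   pip install streamlit langchain-community langchain-core rank_bm25
#   streamlit run app.py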