File size: 1,993 Bytes
b9f0901
309c9bf
 
 
 
 
 
 
 
 
 
 
 
 
b9f0901
309c9bf
b9f0901
 
 
 
 
 
 
 
 
 
309c9bf
 
 
 
 
 
 
 
 
 
b9f0901
 
 
 
309c9bf
771786d
2f7da08
 
 
3e2bddf
2f7da08
bec3a32
2f7da08
bec3a32
87f166c
870bb58
 
3e2bddf
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import datetime
import json

from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document
import streamlit as st


def load_docs_from_json(json_path):
    """Load paper records from a JSON file and wrap each one in a ``Document``.

    Every record read from the file must provide the keys ``title``,
    ``abstract``, ``link``, ``authors``, ``submitter`` and ``date``.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        A list with one ``Document`` per record, in file order. The page
        content is the title followed by the abstract; the remaining fields
        (plus the title) are copied into the document metadata.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a record is missing one of the required keys.
    """
    # Decode explicitly as UTF-8: without it, open() uses the locale's
    # preferred encoding, which breaks on non-ASCII text on some platforms.
    with open(json_path, encoding="utf-8") as f:
        papers = json.load(f)

    # Fields copied verbatim from each record into Document.metadata.
    metadata_keys = ('title', 'link', 'authors', 'submitter', 'date')

    return [
        Document(
            page_content=f"Title: {paper['title']}\n\nAbstract: {paper['abstract']}",
            metadata={key: paper[key] for key in metadata_keys},
        )
        for paper in papers
    ]


# init
json_path = "hf_daily_papers_2023-05-04_2024-06-27.json"


@st.cache_resource
def _build_index(path):
    """Load the papers stored at *path* and build a BM25 retriever over them.

    Streamlit re-executes this script from the top on every user interaction.
    Caching the result means the JSON file is parsed and the index is built
    once per distinct *path* rather than once per search.

    Returns:
        A ``(docs, retriever)`` tuple.
    """
    loaded_docs = load_docs_from_json(path)
    bm25 = BM25Retriever.from_documents(loaded_docs)
    bm25.k = 10  # retrieve the top 10 documents per query
    return loaded_docs, bm25


docs, retriever = _build_index(json_path)

# Date range covered by the loaded papers (displayed in the page header).
dates = [datetime.datetime.strptime(doc.metadata['date'], '%Y-%m-%d') for doc in docs]
oldest_date = min(dates)
newest_date = max(dates)

# streamlit
# Page header: title, corpus statistics and a short usage note.
st.title("HF Daily Papers Search")
st.markdown("Search papers from [HF daily papers](https://huggingface.co/papers).")
st.markdown(f"Number of documents: `{len(docs)}`.")
st.markdown(f"From `{oldest_date.strftime('%Y-%m-%d')}` to `{newest_date.strftime('%Y-%m-%d')}`.")
st.markdown("This app uses [BM25](https://en.wikipedia.org/wiki/Okapi_BM25), allowing you to search not only with keywords like \"machine learning\" \n  but also with documents like \"How to generate synthetic data using LLM.\"")

prompt = st.chat_input("Search anything...")

# chat_input yields a falsy value until the user submits a query.
if prompt:
    results = retriever.invoke(prompt)

    # st.markdown (not st.text) so the backticks render as inline code
    # instead of being shown literally.
    st.markdown(f"Top `{len(results)}` related papers")

    # One collapsed expander per hit: metadata first, then title + abstract.
    for result in results:
        with st.expander(label=result.metadata['title'], expanded=False):
            for key, value in result.metadata.items():
                st.write(f"{key}: {value}")
            st.divider()
            st.markdown(result.page_content)