# ArxivNewsLetter / app.py
import gradio as gr
from datetime import datetime, timedelta
import arxiv
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.chains import LLMChain, StuffDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.schema import Document
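# Maximum number of articles fetched per arXiv query, and the timestamp format
# expected by arXiv's submittedDate filter.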
MAX_RESULTS = 100
FORMAT = '%Y%m%d%H%M%S'
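# Sentence-transformers embeddings (HuggingFaceEmbeddings' default model) used to index the abstracts.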
embeddings = HuggingFaceEmbeddings()
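# How each retrieved article is rendered before being stuffed into the newsletter prompt.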
document_prompt = PromptTemplate(
template="Title: {title}\nContent: {page_content}",
input_variables=["page_content", "title"],
)
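# Prompt that turns the user's research description ("context") and the selected abstracts ("text")
# into a Markdown newsletter.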
prompt = PromptTemplate(
template="""Write an engaging newsletter on the most recent exciting developments in the following field:"{context}". Base the newsletter on the articles below. Extract the most exciting points and combine them into an excillerating newsletter. Use emojis to catch attention and use the Markdown format.\n\n#ARTICLES\n"{text}"\n\nNEWSLETTER:\n# AI curated newsletter\n""",
input_variables=["context", "text"])
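# Generation is delegated to the hosted starchat-beta model via the Hugging Face Hub inference API
# (a HUGGINGFACEHUB_API_TOKEN must be available in the environment).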
REPO_ID = "HuggingFaceH4/starchat-beta"
llm = HuggingFaceHub(
repo_id=REPO_ID,
model_kwargs={
"max_new_tokens": 1024,
"do_sample": True,
"temperature": 0.8,
"top_p": 0.9
}
)
llm_chain = LLMChain(llm=llm, prompt=prompt, verbose=True)
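# Stuff all retrieved documents into the prompt's "text" variable and make a single LLM call.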
stuff_chain = StuffDocumentsChain(
llm_chain=llm_chain,
document_variable_name="text",
document_prompt=document_prompt,
verbose=True,
)
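# Date window for the arXiv query: from `lookback_days` days before the most recent
# submission cutoff up to now.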
def get_date_range(lookback_days: float):
    # Get the current date and time in UTC
    now_utc = datetime.utcnow()
    max_date = now_utc
    # arXiv's daily submission cutoff is 18:00 UTC; anchor the window two days before today's
    # cutoff (subtracting a timedelta avoids invalid day numbers at the start of a month)
    today_1800_utc = datetime(now_utc.year, now_utc.month, now_utc.day, 18, 0, 0) - timedelta(days=2)
    min_date = today_1800_utc - timedelta(days=lookback_days)
return min_date, max_date
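# Fetch abstracts for the given category and date window from the arXiv API and wrap them
# as LangChain Documents.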
def get_documents(category: str, min_date: datetime, max_date: datetime):
# We use the arxiv package instead of Langchain's ArxivLoader,
# because the latter automatically loads pdfs which results in poor performance.
query = f"cat:{category} AND submittedDate:[{min_date.strftime(FORMAT)} TO {max_date.strftime(FORMAT)}]"
search = arxiv.Search(
query=query,
max_results=MAX_RESULTS,
sort_by=arxiv.SortCriterion.SubmittedDate
)
docs = [Document(
page_content=doc.summary,
metadata={
"authors": ", ".join(map(str, doc.authors)),
"categories": ", ".join(map(str, doc.categories)),
"id": doc.get_short_id(),
"title": doc.title,
}
) for doc in search.results()]
return docs
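# Main Gradio callback: fetch recent abstracts, select the ones most relevant to the user's
# query, and ask the LLM to write the newsletter.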
def get_data(category: str, lookback_days: float, user_query: str):
print("User query:", user_query)
min_date, max_date = get_date_range(lookback_days)
docs = get_documents(category, min_date, max_date)
if len(docs) == 0:
return "Found no documents. Check if the category is correct or consider increasing the value for 'Articles from this many days in the past will be searched through.'."
db = Chroma.from_documents(docs, embeddings)
retriever = db.as_retriever()
relevant_docs = retriever.get_relevant_documents(user_query)
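    # Render the retrieved articles as a Markdown appendix shown below the newsletter.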
articles = ""
for doc in relevant_docs:
articles += f"**Title: {doc.metadata['title']}**\n\nAuthors: {doc.metadata['authors']}\n\nAbstract: {doc.page_content}\n\nID: {doc.metadata['id']}\n\n"
output = stuff_chain({"input_documents": relevant_docs, "context": user_query})
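    # starchat-beta terminates its reply with <|end|>; keep only the text before it.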
output_text = output["output_text"].split("<|end|>")[0]
print("LLM output:", output_text)
return f"# Your AI curated newsletter\n{output_text}\n\n## This newsletter was AI generated by filtering {len(docs)} articles down to the following relevant articles:\n\n{articles}"
with gr.Blocks() as demo:
gr.Markdown(
"""
# Arxiv AI Curated Newsletter
Get a newsletter-style summary of today's Arxiv articles personalised to your field of research.
"""
)
with gr.Row():
with gr.Column():
with gr.Accordion("Parameters", open=False):
lookback_days = gr.Number(2, label="Articles from this many days in the past will be searched through.", minimum=1, maximum=7)
category = gr.Textbox(value="hep-th", label="Which category to search through. See https://arxiv.org/category_taxonomy for possible values.")
with gr.Box():
gr.Markdown("Describe your field of research in a few words or sentences.")
input_text = gr.Textbox(placeholder="The relationship between Euclidean solutions to supergravity and black hole microstates.", container=False, show_label=False)
gr.Examples(
[["Supersymmetric Conformal Field Theory"], ["Black hole information paradox"]],
input_text,
)
button = gr.Button(value="Submit")
with gr.Column():
with gr.Box():
output = gr.Markdown("Press 'submit' to see your results.")
    button.click(fn=get_data, inputs=[category, lookback_days, input_text], outputs=output)
demo.queue().launch()