# Arxiv AI Curated Newsletter — Gradio Space
# Searches recent arXiv submissions in a category, retrieves the articles most
# relevant to the user's research field, and generates a newsletter with an LLM.
from datetime import datetime, timedelta

import arxiv
import gradio as gr
from langchain.chains import LLMChain, StuffDocumentsChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.vectorstores import Chroma
# --- Retrieval / generation configuration ---------------------------------

# Maximum number of arXiv articles fetched per search.
MAX_RESULTS = 100
# Timestamp format expected by the arXiv API's submittedDate range queries.
FORMAT = '%Y%m%d%H%M%S'

# Embedding model used to index article abstracts for similarity search.
embeddings = HuggingFaceEmbeddings()

# How each retrieved article is rendered before being stuffed into the prompt.
document_prompt = PromptTemplate(
    template="Title: {title}\nContent: {page_content}",
    input_variables=["page_content", "title"],
)

# Newsletter-generation prompt: {context} is the user's research field,
# {text} is the concatenated rendered articles.
# NOTE: fixed the misspelling "excillerating" -> "exhilarating".
prompt = PromptTemplate(
    template="""Write an engaging newsletter on the most recent exciting developments in the following field:"{context}". Base the newsletter on the articles below. Extract the most exciting points and combine them into an exhilarating newsletter. Use emojis to catch attention and use the Markdown format.\n\n#ARTICLES\n"{text}"\n\nNEWSLETTER:\n# AI curated newsletter\n""",
    input_variables=["context", "text"])

# Hosted chat model used for generation.
REPO_ID = "HuggingFaceH4/starchat-beta"
llm = HuggingFaceHub(
    repo_id=REPO_ID,
    model_kwargs={
        "max_new_tokens": 1024,
        "do_sample": True,
        "temperature": 0.8,
        "top_p": 0.9
    }
)
llm_chain = LLMChain(llm=llm, prompt=prompt, verbose=True)
# "Stuff" chain: concatenates all retrieved documents into one prompt call.
stuff_chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_variable_name="text",
    document_prompt=document_prompt,
    verbose=True,
)
def get_date_range(lookback_days: float):
    """Return the (min_date, max_date) window to search arXiv over.

    max_date is "now"; min_date is `lookback_days` days before the most
    recent arXiv submission cutoff (18:00 UTC), shifted two days back to
    account for the announcement delay.
    """
    max_date = datetime.today()
    # Current date and time in UTC.
    now_utc = datetime.utcnow()
    # 18:00 UTC is the cutoff time for arXiv submissions. Subtract the two
    # days with timedelta arithmetic: the original `now_utc.day - 2` raised
    # ValueError whenever it ran on the 1st or 2nd of a month.
    cutoff_1800_utc = datetime(now_utc.year, now_utc.month, now_utc.day, 18, 0, 0) - timedelta(days=2)
    min_date = cutoff_1800_utc - timedelta(days=lookback_days)
    return min_date, max_date
def get_documents(category: str, min_date: datetime, max_date: datetime):
    """Fetch abstracts of arXiv articles in `category` submitted inside the window.

    We query the arxiv package directly instead of Langchain's ArxivLoader,
    because the loader downloads full PDFs, which performs poorly.
    """
    window = f"[{min_date.strftime(FORMAT)} TO {max_date.strftime(FORMAT)}]"
    search = arxiv.Search(
        query=f"cat:{category} AND submittedDate:{window}",
        max_results=MAX_RESULTS,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )
    documents = []
    for result in search.results():
        metadata = {
            "authors": ", ".join(str(author) for author in result.authors),
            "categories": ", ".join(str(cat) for cat in result.categories),
            "id": result.get_short_id(),
            "title": result.title,
        }
        documents.append(Document(page_content=result.summary, metadata=metadata))
    return documents
def get_data(category: str, lookback_days: float, user_query: str):
    """End-to-end pipeline: fetch articles, keep the relevant ones, write newsletter."""
    print("User query:", user_query)
    min_date, max_date = get_date_range(lookback_days)
    docs = get_documents(category, min_date, max_date)
    if not docs:
        return "Found no documents. Check if the category is correct or consider increasing the value for 'Articles from this many days in the past will be searched through.'."
    # Index the abstracts and retrieve those most similar to the user's field.
    db = Chroma.from_documents(docs, embeddings)
    retriever = db.as_retriever()
    relevant_docs = retriever.get_relevant_documents(user_query)
    sections = [
        f"**Title: {doc.metadata['title']}**\n\nAuthors: {doc.metadata['authors']}\n\nAbstract: {doc.page_content}\n\nID: {doc.metadata['id']}\n\n"
        for doc in relevant_docs
    ]
    articles = "".join(sections)
    output = stuff_chain({"input_documents": relevant_docs, "context": user_query})
    # starchat terminates generations with an <|end|> token; keep text before it.
    output_text = output["output_text"].split("<|end|>")[0]
    print("LLM output:", output_text)
    return f"# Your AI curated newsletter\n{output_text}\n\n## This newsletter was AI generated by filtering {len(docs)} articles down to the following relevant articles:\n\n{articles}"
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Arxiv AI Curated Newsletter
        Get a newsletter-style summary of today's Arxiv articles personalised to your field of research.
        """
    )
    with gr.Row():
        with gr.Column():
            with gr.Accordion("Parameters", open=False):
                days_input = gr.Number(2, label="Articles from this many days in the past will be searched through.", minimum=1, maximum=7)
                category_box = gr.Textbox(value="hep-th", label="Which category to search through. See https://arxiv.org/category_taxonomy for possible values.")
            with gr.Box():
                gr.Markdown("Describe your field of research in a few words or sentences.")
                query_box = gr.Textbox(placeholder="The relationship between Euclidean solutions to supergravity and black hole microstates.", container=False, show_label=False)
                gr.Examples(
                    [["Supersymmetric Conformal Field Theory"], ["Black hole information paradox"]],
                    query_box,
                )
            submit_btn = gr.Button(value="Submit")
        with gr.Column():
            with gr.Box():
                result_md = gr.Markdown("Press 'submit' to see your results.")
    submit_btn.click(fn=get_data, inputs=[category_box, days_input, query_box], outputs=result_md)
demo.queue().launch()