# ArxivNewsLetter / app.py
import gradio as gr
from datetime import datetime, timedelta
import arxiv
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.chains import LLMChain, StuffDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.schema import Document
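# Maximum number of articles fetched per arXiv query, and the timestamp format
# expected by arXiv's submittedDate filter.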
MAX_RESULTS = 100
FORMAT = '%Y%m%d%H%M%S'
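# Sentence-transformers embeddings (HuggingFaceEmbeddings' default model) used to index the abstracts.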
embeddings = HuggingFaceEmbeddings()
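# How each retrieved article is rendered before being stuffed into the newsletter prompt.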
document_prompt = PromptTemplate(
template="Title: {title}\nContent: {page_content}",
input_variables=["page_content", "title"],
)
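# Prompt that turns the user's research description ("context") and the selected abstracts ("text")
# into a Markdown newsletter.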
prompt = PromptTemplate(
template="""Write an engaging newsletter on the most recent exciting developments in the following field:"{context}". Base the newsletter on the articles below. Extract the most exciting points and combine them into an excillerating newsletter. Use emojis to catch attention and use the Markdown format.\n\n#ARTICLES\n"{text}"\n\nNEWSLETTER:\n# AI curated newsletter\n""",
input_variables=["context", "text"])
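# Generation is delegated to the hosted starchat-beta model via the Hugging Face Hub inference API
# (a HUGGINGFACEHUB_API_TOKEN must be available in the environment).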
REPO_ID = "HuggingFaceH4/starchat-beta"
llm = HuggingFaceHub(
repo_id=REPO_ID,
model_kwargs={
"max_new_tokens": 1024,
"do_sample": True,
"temperature": 0.8,
"top_p": 0.9
}
)
llm_chain = LLMChain(llm=llm, prompt=prompt, verbose=True)
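# Stuff all retrieved documents into the prompt's "text" variable and make a single LLM call.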
stuff_chain = StuffDocumentsChain(
llm_chain=llm_chain,
document_variable_name="text",
document_prompt=document_prompt,
verbose=True,
)
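# Date window for the arXiv query: from `lookback_days` days before the most recent
# submission cutoff up to now.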
def get_date_range(lookback_days: float):
    # Get the current date and time in UTC
    now_utc = datetime.utcnow()
    max_date = now_utc
    # arXiv's daily submission cutoff is 18:00 UTC; anchor the window two days before today's
    # cutoff (subtracting a timedelta avoids invalid day numbers at the start of a month)
    today_1800_utc = datetime(now_utc.year, now_utc.month, now_utc.day, 18, 0, 0) - timedelta(days=2)
    min_date = today_1800_utc - timedelta(days=lookback_days)
return min_date, max_date
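# Fetch abstracts for the given category and date window from the arXiv API and wrap them
# as LangChain Documents.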
def get_documents(category: str, min_date: datetime, max_date: datetime):
# We use the arxiv package instead of Langchain's ArxivLoader,
# because the latter automatically loads pdfs which results in poor performance.
query = f"cat:{category} AND submittedDate:[{min_date.strftime(FORMAT)} TO {max_date.strftime(FORMAT)}]"
search = arxiv.Search(
query=query,
max_results=MAX_RESULTS,
sort_by=arxiv.SortCriterion.SubmittedDate
)
docs = [Document(
page_content=doc.summary,
metadata={
"authors": ", ".join(map(str, doc.authors)),
"categories": ", ".join(map(str, doc.categories)),
"id": doc.get_short_id(),
"title": doc.title,
}
) for doc in search.results()]
return docs
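# Main Gradio callback: fetch recent abstracts, select the ones most relevant to the user's
# query, and ask the LLM to write the newsletter.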
def get_data(category: str, lookback_days: float, user_query: str):
print("User query:", user_query)
min_date, max_date = get_date_range(lookback_days)
docs = get_documents(category, min_date, max_date)
if len(docs) == 0:
return "Found no documents. Check if the category is correct or consider increasing the value for 'Articles from this many days in the past will be searched through.'."
db = Chroma.from_documents(docs, embeddings)
retriever = db.as_retriever()
relevant_docs = retriever.get_relevant_documents(user_query)
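    # Render the retrieved articles as a Markdown appendix shown below the newsletter.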
articles = ""
for doc in relevant_docs:
articles += f"**Title: {doc.metadata['title']}**\n\nAuthors: {doc.metadata['authors']}\n\nAbstract: {doc.page_content}\n\nID: {doc.metadata['id']}\n\n"
output = stuff_chain({"input_documents": relevant_docs, "context": user_query})
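    # starchat-beta terminates its reply with <|end|>; keep only the text before it.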
output_text = output["output_text"].split("<|end|>")[0]
print("LLM output:", output_text)
return f"# Your AI curated newsletter\n{output_text}\n\n## This newsletter was AI generated by filtering {len(docs)} articles down to the following relevant articles:\n\n{articles}"
with gr.Blocks() as demo:
gr.Markdown(
"""
# Arxiv AI Curated Newsletter
Get a newsletter-style summary of today's Arxiv articles personalised to your field of research.
"""
)
with gr.Row():
with gr.Column():
with gr.Accordion("Parameters", open=False):
lookback_days = gr.Number(2, label="Articles from this many days in the past will be searched through.", minimum=1, maximum=7)
category = gr.Textbox(value="hep-th", label="Which category to search through. See https://arxiv.org/category_taxonomy for possible values.")
with gr.Box():
gr.Markdown("Describe your field of research in a few words or sentences.")
input_text = gr.Textbox(placeholder="The relationship between Euclidean solutions to supergravity and black hole microstates.", container=False, show_label=False)
gr.Examples(
[["Supersymmetric Conformal Field Theory"], ["Black hole information paradox"]],
input_text,
)
button = gr.Button(value="Submit")
with gr.Column():
with gr.Box():
output = gr.Markdown("Press 'submit' to see your results.")
    button.click(fn=get_data, inputs=[category, lookback_days, input_text], outputs=output)
demo.queue().launch()