Spaces:

taesiri
/

ClaudeReadsArxiv

Paused

App Files Files Community

ClaudeReadsArxiv / app.py

taesiri

update

76cda31 8 months ago

raw history blame

No virus

9.32 kB

	import os
	import re
	import tempfile
	import os

	import arxiv
	import gradio as gr
	import requests
	from anthropic import AI_PROMPT, HUMAN_PROMPT, Anthropic
	from arxiv_latex_extractor import get_paper_content
	from fastapi.staticfiles import StaticFiles
	from huggingface_hub import HfApi


	from coreservice import app


	# Instruction line placed ahead of the paper's LaTeX source when the model
	# context is assembled in load_context().
	LEADING_PROMPT = "Read the following paper:"

	# with open("assets/custom.css", "r", encoding="utf-8") as f:
	# custom_css = f.read()

	# Inline stylesheet handed to gr.Blocks(css=...); it pins the height of the
	# element with id "chatbot".
	custom_css = """
	div#component-4 #chatbot {
	height: 800px !important;
	}

	"""


	def replace_texttt(text):
	    r"""Rewrite LaTeX ``\texttt{...}`` spans as Markdown emphasis (``*...*``).

	    Args:
	        text: String that may contain ``\texttt{...}`` commands. ``None`` or
	            an empty string is returned unchanged so callers can pass
	            missing metadata straight through.

	    Returns:
	        ``text`` with every ``\texttt{content}`` replaced by ``*content*``.
	    """
	    if not text:
	        return text
	    # Non-greedy group so several \texttt{} spans on one line are handled
	    # independently; the previous "(.?)" only ever matched 0-1 characters.
	    return re.sub(r"\\texttt\{(.*?)\}", r"*\1*", text)


	def get_paper_info(paper_id):
	    """Fetch a paper's title and abstract from arXiv by its identifier.

	    Args:
	        paper_id: arXiv identifier passed to ``arxiv.Search(id_list=...)``.

	    Returns:
	        A ``(title, summary)`` tuple with newline and carriage-return
	        characters replaced by spaces, or ``(None, None)`` when the search
	        yields no result.
	    """
	    results = arxiv.Search(id_list=[paper_id]).results()
	    entry = next(results, None)
	    if entry is None:
	        return None, None

	    def _single_line(value):
	        # Collapse hard line breaks so the text renders as one paragraph.
	        return value.replace("\n", " ").replace("\r", " ")

	    return _single_line(entry.title), _single_line(entry.summary)


	def get_paper_from_huggingface(paper_id, timeout=30):
	    """Download a cached LaTeX source from the ``taesiri/arxiv_db`` dataset.

	    Args:
	        paper_id: Identifier used as the file stem under ``papers/``.
	        timeout: Seconds to wait for the HTTP request before giving up.
	            Without a timeout ``requests`` may block indefinitely.

	    Returns:
	        The response body as text, or ``None`` when the request fails for
	        any reason (network error, timeout, non-2xx status).
	    """
	    url = f"https://huggingface.co/datasets/taesiri/arxiv_db/raw/main/papers/{paper_id}.tex"
	    try:
	        response = requests.get(url, timeout=timeout)
	        response.raise_for_status()
	    except requests.RequestException:
	        # A cache miss (404) or a transport failure both mean "not available";
	        # the caller falls back to another source.
	        return None
	    return response.text


	class ContextualQA:
	    """Multi-turn question answering over a fixed text context.

	    The instance keeps the context string plus parallel lists of the
	    questions asked and the answers received, and replays them in every
	    prompt sent through ``client.completions.create``. The client itself is
	    left out of the pickled state and must be re-attached after unpickling.
	    """

	    def __init__(self, client, model="claude-2.0"):
	        """Store the API client and model name and start with empty state."""
	        self.client = client
	        self.model = model
	        self.context = ""
	        self.questions = []
	        self.responses = []

	    def load_text(self, text):
	        """Replace the current context with ``text``."""
	        self.context = text

	    def _history_block(self):
	        """Render previous question/answer pairs as prompt text.

	        Returns an empty string when nothing has been asked yet.
	        """
	        if not self.questions:
	            return ""
	        # The first pair omits HUMAN_PROMPT because the full prompt already
	        # opens with one ahead of the context.
	        opening = f"Question: {self.questions[0]}\n{AI_PROMPT} Answer: {self.responses[0]}"
	        later_turns = [
	            f"{HUMAN_PROMPT} Question: {asked}\n{AI_PROMPT} Answer: {replied}"
	            for asked, replied in zip(self.questions[1:], self.responses[1:])
	        ]
	        return opening + "\n" + "\n".join(later_turns)

	    def ask_question(self, question):
	        """Send ``question`` with the context and history; return the answer.

	        The question and its answer are appended to the stored history only
	        after the completion call has returned.
	        """
	        full_context = f"{self.context}\n\n{self._history_block()}\n"
	        prompt = f"{HUMAN_PROMPT} {full_context} {HUMAN_PROMPT} {question} {AI_PROMPT}"

	        result = self.client.completions.create(
	            prompt=prompt,
	            stop_sequences=[HUMAN_PROMPT],
	            max_tokens_to_sample=6000,
	            model=self.model,
	            stream=False,
	        )
	        reply = result.completion
	        self.questions.append(question)
	        self.responses.append(reply)
	        return reply

	    def clear_context(self):
	        """Drop the context and the whole question/answer history."""
	        self.context = ""
	        self.questions = []
	        self.responses = []

	    def __getstate__(self):
	        """Return the instance state without the ``client`` entry."""
	        snapshot = dict(self.__dict__)
	        snapshot.pop("client")
	        return snapshot

	    def __setstate__(self, state):
	        """Restore the saved attributes and leave ``client`` unset (``None``)."""
	        self.__dict__.update(state)
	        self.client = None


	def clean_paper_id(raw_id):
	    """Normalize user input into a bare arXiv identifier.

	    Accepts a plain identifier (``"2310.12103"``) or an arXiv URL of the
	    form ``.../abs/<id>`` or ``.../pdf/<id>[.pdf]``. Surrounding whitespace,
	    a ``.pdf`` suffix taken from a PDF URL, and trailing dots (e.g. from a
	    sentence-ending period) are removed in every case.

	    Args:
	        raw_id: Text typed or pasted by the user.

	    Returns:
	        The cleaned identifier string.
	    """
	    cleaned_id = raw_id.strip()

	    url_match = re.search(r"arxiv\.org/(?:abs|pdf)/([\w.]+)", cleaned_id)
	    if url_match:
	        cleaned_id = url_match.group(1)
	        if cleaned_id.endswith(".pdf"):
	            cleaned_id = cleaned_id[: -len(".pdf")]

	    # The capture group allows dots, so a pasted "…/abs/2310.12103." would
	    # otherwise keep its trailing period; strip it for URLs and bare IDs alike.
	    return cleaned_id.rstrip(".")


	def _upload_paper_to_hub(paper_id, latex_source):
	    """Best-effort upload of downloaded LaTeX to the ``taesiri/arxiv_db`` dataset.

	    Failures are printed and swallowed: the upload only populates the cache
	    and must never prevent the paper from being loaded.
	    """
	    try:
	        # Upload from memory so no temporary file is left behind on disk.
	        payload = latex_source.encode("utf-8")
	        # Skip empty or one-byte payloads; they are not worth caching.
	        if len(payload) > 1:
	            hf_api = HfApi(token=os.environ["HUGGINGFACE_TOKEN"])
	            hf_api.upload_file(
	                path_or_fileobj=payload,
	                path_in_repo=f"papers/{paper_id}.tex",
	                repo_id="taesiri/arxiv_db",
	                repo_type="dataset",
	            )
	    except Exception as e:
	        print(f"Error uploading paper with id {paper_id}: {e}")


	def load_context(paper_id):
	    """Load a paper's LaTeX source and prepare a QA model for it.

	    The source is taken from the Hugging Face dataset cache when present;
	    otherwise it is fetched with ``get_paper_content`` and then uploaded to
	    the cache on a best-effort basis.

	    Args:
	        paper_id: Raw user input; an arXiv identifier or URL.

	    Returns:
	        ``(qa_model, chat_history)``. On success ``qa_model`` is a
	        ``ContextualQA`` primed with the paper and ``chat_history`` holds one
	        (user, bot) pair announcing the title and abstract. On failure
	        ``qa_model`` is ``None`` and the pair carries the error message.

	    Raises:
	        KeyError: If ``ANTHROPIC_API_KEY`` is not set in the environment.
	    """
	    # Clean the paper_id to remove spaces or extract ID from URL
	    paper_id = clean_paper_id(paper_id)
	    load_request = f"Load the paper with id {paper_id}."

	    # Check if the paper is already on Hugging Face
	    latex_source = get_paper_from_huggingface(paper_id)

	    # If not found on Hugging Face, use arxiv_latex_extractor
	    if not latex_source:
	        try:
	            latex_source = get_paper_content(paper_id)
	        except Exception as e:
	            # The chatbot expects (user, bot) pairs, so pair the error with
	            # the request that triggered it.
	            return None, [
	                (load_request, f"Error loading paper with id {paper_id}: {e}")
	            ]
	        if not latex_source:
	            return None, [
	                (
	                    load_request,
	                    f"Error loading paper with id {paper_id}: no LaTeX source was returned",
	                )
	            ]
	        _upload_paper_to_hub(paper_id, latex_source)

	    # Initialize the Anthropic client and QA model
	    client = Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
	    qa_model = ContextualQA(client, model="claude-2.0")
	    qa_model.load_text(f"{LEADING_PROMPT}\n{latex_source}")

	    # Get the paper's title and abstract; either may be None when arXiv
	    # returns no entry, in which case a placeholder is shown instead.
	    title, abstract = get_paper_info(paper_id)
	    title = replace_texttt(title) if title else "(title unavailable)"
	    abstract = replace_texttt(abstract) if abstract else "(abstract unavailable)"

	    return (
	        qa_model,
	        [
	            (
	                load_request,
	                f"\nTitle: {title}\n\nAbstract: {abstract}\n\nPaper loaded. You can now ask questions.",
	            )
	        ],
	    )


	def answer_fn(qa_model, question, chat_history):
	    """Answer a user question about the loaded paper and extend the chat.

	    Args:
	        qa_model: The ``ContextualQA`` held in the session state, or ``None``
	            when no paper has been loaded yet.
	        question: Text from the question box.
	        chat_history: List of (user, bot) pairs shown in the chatbot;
	            ``None`` is treated as an empty history.

	    Returns:
	        ``(qa_model, chat_history, "")`` where the empty string clears the
	        question box. Problems (blank question, no paper loaded, API error)
	        are reported as chat entries rather than raised.
	    """
	    if chat_history is None:
	        chat_history = []

	    # if question is empty, tell user that they need to ask a question
	    if not question or not question.strip():
	        chat_history.append(("No Question Asked", "Please ask a question."))
	        return qa_model, chat_history, ""

	    # Without a loaded paper there is no model object to query.
	    if qa_model is None:
	        chat_history.append(
	            ("No Paper Loaded", "Please load a paper before asking a question.")
	        )
	        return qa_model, chat_history, ""

	    try:
	        # ContextualQA drops its client when its state is pickled, so attach
	        # a fresh one on every call. Doing this inside the try block also
	        # reports a missing API key in the chat instead of crashing.
	        qa_model.client = Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
	        answer = qa_model.ask_question(question)
	    except Exception as e:
	        chat_history.append(("Error Asking Question", str(e)))
	        return qa_model, chat_history, ""

	    chat_history.append((question, answer))
	    return qa_model, chat_history, ""


	def clear_context():
	    """Return a new empty list, used as the chatbot value to clear the chat."""
	    empty_history = []
	    return empty_history


	# Build the Gradio UI: a chatbot pane on the left, paper/question controls on
	# the right, followed by the event wiring for the three buttons.
	with gr.Blocks(
	    theme=gr.themes.Soft(), css=custom_css, title="ArXiv QA with Claude"
	) as demo:
	    gr.HTML(
	        """
	<h1 style='text-align: center; font-size: 24px;'>
	Explore ArXiv Papers in Depth with <code>claude-2.0</code> - Ask Questions and Get Answers Instantly
	</h1>
	"""
	    )
	    # gr.HTML(
	    # """
	    # <p style='text-align: justify; font-size: 18px; margin: 10px;'>
	    # Explore the depths of ArXiv papers with our interactive app, powered by the advanced <code>claude-2.0</code> model. Ask detailed questions and get immediate, context-rich answers from academic papers.
	    # </p>
	    # """
	    # )

	    # NOTE: the separator below is a plain "|"; "\|" is an invalid escape
	    # sequence in a non-raw string and would render a stray backslash.
	    gr.HTML(
	        """
	<center>
	<a href="https://huggingface.co/spaces/taesiri/ClaudeReadsArxiv?duplicate=true">
	<img src="https://bit.ly/3gLdBN6" alt="Duplicate Space" style="vertical-align: middle; max-width: 100px; margin-right: 10px;">
	</a>
	<span style="font-size: 14px; vertical-align: middle;">
	Duplicate the Space with your Anthropic API Key  |
	Follow me on Twitter for more updates: <a href="https://twitter.com/taesiri" target="_blank">@taesiri</a>
	</span>
	</center>
	"""
	    )

	    # equal_height is passed to the constructor; the .style() helper is
	    # deprecated in Gradio.
	    with gr.Row(equal_height=False):
	        # elem_id (previously misspelled "emem_id") exposes the column to CSS.
	        with gr.Column(scale=2, elem_id="column-flex"):
	            chatbot = gr.Chatbot(
	                elem_id="chatbot",
	                avatar_images=("./assets/user.png", "./assets/Claude.png"),
	            )

	        with gr.Column(scale=1):
	            paper_id_input = gr.Textbox(label="Enter Paper ID", value="2310.12103")
	            btn_load = gr.Button("Load Paper")
	            # Per-session holder for the ContextualQA object returned by load_context.
	            qa_model = gr.State()

	            question_txt = gr.Textbox(
	                label="Question", lines=5, placeholder="Type your question here..."
	            )

	            btn_answer = gr.Button("Answer Question")
	            btn_clear = gr.Button("Clear Chat")

	    gr.HTML(
	        """<center>All the inputs are being sent to Anthropic's Claude endpoints. Please refer to <a href="https://legal.anthropic.com/#privacy">this link</a> for privacy policy.</center>"""
	    )

	    gr.Markdown(
	        "## Acknowledgements\n"
	        "This project is made possible through the generous support of "
	        "[Anthropic](https://www.anthropic.com/), who provided free access to the `claude-2.0` API."
	    )

	    # Loading a paper replaces both the session model and the chat contents.
	    btn_load.click(load_context, inputs=[paper_id_input], outputs=[qa_model, chatbot])

	    # The button and pressing Enter in the question box trigger the same handler;
	    # its third output clears the question box.
	    btn_answer.click(
	        answer_fn,
	        inputs=[qa_model, question_txt, chatbot],
	        outputs=[qa_model, chatbot, question_txt],
	    )

	    question_txt.submit(
	        answer_fn,
	        inputs=[qa_model, question_txt, chatbot],
	        outputs=[qa_model, chatbot, question_txt],
	    )

	    # Only the visible chat is reset here; the session model is not an output.
	    btn_clear.click(clear_context, outputs=[chatbot])


	# Serve the local "js" directory as static files under the /js route of the
	# app object imported from coreservice.
	app.mount("/js", StaticFiles(directory="js"), name="js")
	# Attach the Gradio UI defined above to the same app at the root path.
	gr.mount_gradio_app(app, demo, path="/")