Spaces:

oceansweep
/

tldw

Running

App Files Files Community

tldw / App_Function_Libraries /Third_Party /Arxiv.py

oceansweep

Upload 2 files

8619cce verified about 2 months ago

raw

history blame

5.53 kB

	# Arxiv.py
	# Description: This file contains the functions for searching and ingesting arXiv papers.
	import time

	import arxiv
	import requests
	from bs4 import BeautifulSoup
	from datetime import datetime

	from requests.adapters import HTTPAdapter
	from urllib3 import Retry

	#
	# Local Imports
	from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
	#
	#####################################################################################################
	#
	# Functions:

	# Number of results requested per page of arXiv API search results
	# (sent as the max_results parameter of the query URL).
	ARXIV_PAGE_SIZE = 10


	def fetch_arxiv_pdf_url(paper_id):
	    """Look up the PDF link for a single arXiv paper via the export API.

	    The request is sent through a session that retries up to three times
	    (with exponential backoff) on HTTP 429/500/502/503/504 responses.

	    Args:
	        paper_id: arXiv identifier interpolated into the ``id_list`` query
	            parameter.

	    Returns:
	        The ``href`` of the feed's ``<link title="pdf">`` element, or ``None``
	        if the request failed or the feed contains no such link.
	    """
	    base_url = f"http://export.arxiv.org/api/query?id_list={paper_id}"

	    # Configure retry strategy
	    retry_strategy = Retry(
	        total=3,  # Maximum number of retries
	        status_forcelist=[429, 500, 502, 503, 504],  # Retry on these status codes
	        backoff_factor=1,  # Exponential backoff factor
	    )
	    adapter = HTTPAdapter(max_retries=retry_strategy)

	    try:
	        # The context manager closes the session's pooled connections on exit.
	        with requests.Session() as http:
	            http.mount("https://", adapter)
	            http.mount("http://", adapter)
	            # A timeout keeps a stalled connection from blocking forever.
	            response = http.get(base_url, timeout=30)
	            response.raise_for_status()
	            feed_xml = response.text
	    except requests.exceptions.RequestException as e:
	        print(f"Error: {e}")
	        return None

	    # Delay between requests to avoid rate limiting
	    time.sleep(2)

	    soup = BeautifulSoup(feed_xml, 'xml')
	    pdf_link = soup.find('link', title='pdf')
	    # find() returns None when nothing matches (e.g. an unknown paper ID);
	    # report it the same way as a failed request instead of raising TypeError.
	    if pdf_link is None or not pdf_link.get('href'):
	        print(f"Error: No PDF link found for arXiv ID {paper_id}")
	        return None
	    return pdf_link['href']


	def search_arxiv(query):
	    """Run a relevance-sorted arXiv search and return up to 10 results.

	    Args:
	        query: Search string passed to ``arxiv.Search``.

	    Returns:
	        A list of ``[title, paper_id, authors, summary]`` lists, where
	        ``paper_id`` is the last ``/``-separated segment of the result's
	        ``entry_id`` and ``authors`` is a comma-separated string of names.
	    """
	    api_client = arxiv.Client()
	    relevance_search = arxiv.Search(
	        query=query,
	        max_results=10,
	        sort_by=arxiv.SortCriterion.Relevance,
	    )

	    return [
	        [
	            paper.title,
	            paper.entry_id.split('/')[-1],  # Extract the ID from the entry_id
	            ', '.join(person.name for person in paper.authors),
	            paper.summary,
	        ]
	        for paper in api_client.results(relevance_search)
	    ]


	def fetch_arxiv_xml(paper_id, timeout=30):
	    """Download the raw Atom XML for one arXiv paper from the export API.

	    Args:
	        paper_id: arXiv identifier string appended to the ``id_list`` query.
	        timeout: Seconds to wait for the server before giving up. Without a
	            timeout ``requests`` can block indefinitely on a stalled
	            connection.

	    Returns:
	        The response body as text.

	    Raises:
	        requests.exceptions.RequestException: On connection failure, timeout,
	            or a non-success HTTP status (via ``raise_for_status``).
	    """
	    base_url = "http://export.arxiv.org/api/query?id_list="
	    response = requests.get(base_url + paper_id, timeout=timeout)
	    response.raise_for_status()
	    return response.text


	def parse_arxiv_feed(xml_content):
	    """Parse an arXiv Atom feed into a list of per-paper dictionaries.

	    Args:
	        xml_content: XML text of the feed.

	    Returns:
	        One dict per ``<entry>`` with the keys ``id`` (text after ``/abs/``
	        in the entry id), ``title``, ``authors`` (comma-separated names),
	        ``published`` (the part of the timestamp before ``T``) and
	        ``abstract`` (the entry summary).
	    """
	    def _entry_to_record(node):
	        # Fields are read in a fixed order, then assembled into the record.
	        paper_title = node.title.text.strip()
	        arxiv_id = node.id.text.strip().split('/abs/')[-1]
	        author_names = [
	            person.find('name').text.strip()
	            for person in node.find_all('author')
	        ]
	        published_date = node.published.text.strip().split('T')[0]
	        summary_text = node.summary.text.strip()
	        return {
	            'id': arxiv_id,
	            'title': paper_title,
	            'authors': ', '.join(author_names),
	            'published': published_date,
	            'abstract': summary_text,
	        }

	    feed = BeautifulSoup(xml_content, 'xml')
	    return [_entry_to_record(node) for node in feed.find_all('entry')]


	def build_query_url(query, author, year, start, page_size=None):
	    """Build an arXiv export-API search URL from optional filters.

	    User-supplied values are percent-encoded so characters such as ``&``,
	    ``#``, ``+``, spaces and quotes cannot corrupt the query string.

	    Args:
	        query: Free-text search term (searched in ``all:``); falsy to omit.
	        author: Author name (searched as a quoted ``au:`` phrase); falsy to
	            omit.
	        year: Year used to restrict ``submittedDate`` to Jan 1 00:00 through
	            Dec 31 23:59 of that year; falsy to omit.
	        start: Zero-based offset of the first result to return.
	        page_size: Value for ``max_results``; defaults to ``ARXIV_PAGE_SIZE``
	            when ``None``.

	    Returns:
	        The full query URL. When no filter is given the search query is
	        ``all:*``.
	    """
	    from urllib.parse import quote

	    # TODO: the endpoint is still addressed over plain HTTP.
	    base_url = "http://export.arxiv.org/api/query?"
	    if page_size is None:
	        page_size = ARXIV_PAGE_SIZE

	    # Build search query
	    search_params = []
	    if query:
	        encoded_query = quote(str(query), safe="")
	        search_params.append(f"all:{encoded_query}")
	    if author:
	        encoded_author = quote(str(author), safe="")
	        # %22 is the encoded double quote that delimits a phrase search.
	        search_params.append(f"au:%22{encoded_author}%22")
	    if year:
	        encoded_year = quote(str(year), safe="")
	        # '+' decodes to a space in a query string, matching the '+AND+' joiner.
	        search_params.append(
	            f"submittedDate:[{encoded_year}01010000+TO+{encoded_year}12312359]"
	        )

	    search_query = "+AND+".join(search_params) if search_params else "all:*"

	    return f"{base_url}search_query={search_query}&start={start}&max_results={page_size}"

	def convert_xml_to_markdown(xml_content):
	    """Render the first entry of an arXiv Atom feed as a Markdown document.

	    Args:
	        xml_content: XML text of the feed.

	    Returns:
	        A tuple ``(markdown, title, authors, categories)`` where ``markdown``
	        is the rendered text, ``title`` is the entry title, ``authors`` is a
	        list of author names and ``categories`` is a list of the ``term``
	        attributes of the entry's ``<category>`` elements.

	    Raises:
	        ValueError: If the feed contains no ``<entry>`` element.
	    """
	    soup = BeautifulSoup(xml_content, 'xml')

	    # Extract title, authors, abstract, and other relevant information from the specific paper entry
	    entry = soup.find('entry')
	    if entry is None:
	        # find() returns None when nothing matches; fail with a clear message
	        # instead of an AttributeError on the lookups below.
	        raise ValueError("No <entry> element found in arXiv XML response")

	    title = entry.find('title').text.strip()
	    authors = [author.find('name').text.strip() for author in entry.find_all('author')]
	    abstract = entry.find('summary').text.strip()
	    published = entry.find('published').text.strip()

	    categories = [category['term'] for category in entry.find_all('category')]

	    # Constructing a markdown representation for the paper
	    markdown = (
	        f"# {title}\n\n"
	        f"Authors: {', '.join(authors)}\n\n"
	        f"Published Date: {published}\n\n"
	        f"Abstract:\n\n{abstract}\n\n"
	        f"Categories: {', '.join(categories)}\n\n"
	    )

	    return markdown, title, authors, categories


	def process_and_ingest_arxiv_paper(paper_id, additional_keywords):
	    """Fetch one arXiv paper, convert it to Markdown and store it in the DB.

	    Args:
	        paper_id: arXiv identifier of the paper to ingest.
	        additional_keywords: Extra keyword text appended after the
	            ``arxiv`` tag and the paper's categories; falsy to add none.

	    Returns:
	        A success message naming the paper, or an error message string if
	        any step raised an exception (exceptions are not propagated).
	    """
	    try:
	        feed_xml = fetch_arxiv_xml(paper_id)
	        paper_markdown, title, author_list, category_terms = convert_xml_to_markdown(feed_xml)

	        # Keywords are stored as one comma-separated string.
	        keyword_string = "arxiv," + ','.join(category_terms)
	        if additional_keywords:
	            keyword_string = f"{keyword_string},{additional_keywords}"

	        media_record = dict(
	            url=f"https://arxiv.org/abs/{paper_id}",
	            title=title,
	            media_type='document',
	            content=paper_markdown,
	            keywords=keyword_string,
	            prompt='No prompt for arXiv papers',
	            summary='arXiv paper ingested from XML',
	            transcription_model='None',
	            author=', '.join(author_list),
	            ingestion_date=datetime.now().strftime('%Y-%m-%d'),
	        )
	        add_media_with_keywords(**media_record)

	        return f"arXiv paper '{title}' ingested successfully."
	    except Exception as err:
	        return f"Error processing arXiv paper: {str(err)}"

	#
	# End of Arxiv.py
	####################################################################################################