|
|
|
|
|
import time
|
|
|
|
import arxiv
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from datetime import datetime
|
|
|
|
from requests.adapters import HTTPAdapter
|
|
from urllib3 import Retry
|
|
|
|
|
|
|
|
from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Number of results requested per page; build_query_url passes this value as
# the ``max_results`` parameter of the search URL.
ARXIV_PAGE_SIZE = 10
|
|
|
|
|
|
def fetch_arxiv_pdf_url(paper_id):
    """Look up the PDF link of an arXiv paper through the export API.

    Args:
        paper_id: arXiv identifier placed in the ``id_list`` query parameter.

    Returns:
        The ``href`` of the response's ``<link title="pdf">`` element, or
        ``None`` when the request fails or the response has no such link.
    """
    base_url = f"http://export.arxiv.org/api/query?id_list={paper_id}"

    # Retry rate-limit and server-side errors a few times with backoff.
    retry_strategy = Retry(
        total=3,
        status_forcelist=[429, 500, 502, 503, 504],
        backoff_factor=1,
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)

    # Using the session as a context manager releases its pooled connections
    # once the lookup is finished.
    with requests.Session() as http:
        http.mount("https://", adapter)
        http.mount("http://", adapter)

        try:
            # A timeout keeps a stalled connection from blocking forever; it
            # surfaces as a RequestException and is reported like any other
            # request failure.
            response = http.get(base_url, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"**Error:** {e}")
            return None

        # Pause after a successful API call before handing control back.
        time.sleep(2)

        soup = BeautifulSoup(response.text, 'xml')
        pdf_link = soup.find('link', title='pdf')

    # A response without a PDF link (e.g. an unknown id) is reported the same
    # way as a failed request instead of raising TypeError on ``None``.
    pdf_url = pdf_link.get('href') if pdf_link is not None else None
    if not pdf_url:
        print(f"**Error:** No PDF link found for arXiv paper {paper_id}")
        return None
    return pdf_url
|
|
|
|
|
|
def search_arxiv(query):
    """Run a relevance-sorted arXiv search and return the hits as rows.

    At most ten results are requested. Each row is a list of
    ``[title, paper id, comma-joined author names, summary]``, where the
    paper id is the last ``/``-separated segment of the result's entry id.
    """
    api_client = arxiv.Client()
    request = arxiv.Search(
        query=query,
        max_results=10,
        sort_by=arxiv.SortCriterion.Relevance,
    )

    return [
        [
            paper.title,
            paper.entry_id.rsplit('/', 1)[-1],
            ', '.join(person.name for person in paper.authors),
            paper.summary,
        ]
        for paper in api_client.results(request)
    ]
|
|
|
|
|
|
def fetch_arxiv_xml(paper_id):
    """Download the raw XML the arXiv export API returns for a paper id.

    Args:
        paper_id: arXiv identifier string appended to the ``id_list`` query.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    base_url = "http://export.arxiv.org/api/query?id_list="
    # Bound the wait so a stalled connection cannot hang the caller forever.
    response = requests.get(base_url + paper_id, timeout=30)
    response.raise_for_status()
    return response.text
|
|
|
|
|
|
def parse_arxiv_feed(xml_content):
    """Convert an arXiv API XML response into a list of per-paper dicts.

    Every ``<entry>`` element yields a dict with the keys ``id`` (the text
    after ``/abs/`` in the entry id), ``title``, ``authors`` (names joined
    with ``", "``), ``published`` (the part before ``T`` in the timestamp)
    and ``abstract`` (the entry summary).
    """

    def describe(node):
        # Read the fields in a fixed order, then assemble the record.
        heading = node.title.text.strip()
        identifier = node.id.text.strip().rpartition('/abs/')[2]
        names = [
            person.find('name').text.strip()
            for person in node.find_all('author')
        ]
        date_part = node.published.text.strip().partition('T')[0]
        summary = node.summary.text.strip()
        return {
            'id': identifier,
            'title': heading,
            'authors': ', '.join(names),
            'published': date_part,
            'abstract': summary,
        }

    feed = BeautifulSoup(xml_content, 'xml')
    return [describe(node) for node in feed.find_all('entry')]
|
|
|
|
|
|
def build_query_url(query, author, year, start):
    """Build an arXiv export-API search URL for one page of results.

    Args:
        query: Free-text terms, searched with the ``all:`` field prefix.
        author: Author name, searched as a quoted ``au:`` phrase.
        year: Year used to restrict ``submittedDate`` to the range from
            January 1st 00:00 through December 31st 23:59 of that year.
        start: Value placed in the URL's ``start`` parameter.

    Falsy filters are skipped; when none is given the search falls back to
    ``all:*``. The page size comes from ``ARXIV_PAGE_SIZE``.

    Returns:
        The complete query URL as a string.
    """
    from urllib.parse import quote

    base_url = "http://export.arxiv.org/api/query?"
    search_params = []

    if query:
        search_params.append(f"all:{query}")
    if author:
        search_params.append(f'au:"{author}"')
    if year:
        search_params.append(f'submittedDate:[{year}01010000 TO {year}12312359]')

    # Percent-encode each term so spaces, quotes, '&', '#', '+' or '%' in the
    # caller's input cannot truncate the URL or inject extra parameters.
    # ':' and '[]' stay literal because they belong to the query syntax.
    encoded_terms = [quote(term, safe=':[]') for term in search_params]
    search_query = "+AND+".join(encoded_terms) if encoded_terms else "all:*"

    return f"{base_url}search_query={search_query}&start={start}&max_results={ARXIV_PAGE_SIZE}"
|
|
|
|
def convert_xml_to_markdown(xml_content):
    """Render the first entry of an arXiv API XML response as Markdown.

    Args:
        xml_content: XML text containing at least one ``<entry>`` element.

    Returns:
        A tuple ``(markdown, title, authors, categories)``: the Markdown
        document, the stripped title, the list of author names, and the list
        of ``term`` attributes of the entry's ``<category>`` elements.

    Raises:
        ValueError: If the XML contains no ``<entry>`` element.
    """
    soup = BeautifulSoup(xml_content, 'xml')

    entry = soup.find('entry')
    if entry is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("No entry found in arXiv XML response")

    title = entry.find('title').text.strip()
    authors = [author.find('name').text.strip() for author in entry.find_all('author')]
    abstract = entry.find('summary').text.strip()
    published = entry.find('published').text.strip()
    categories = [category['term'] for category in entry.find_all('category')]

    markdown = (
        f"# {title}\n\n"
        f"**Authors:** {', '.join(authors)}\n\n"
        f"**Published Date:** {published}\n\n"
        f"**Abstract:**\n\n{abstract}\n\n"
        f"**Categories:** {', '.join(categories)}\n\n"
    )

    return markdown, title, authors, categories
|
|
|
|
|
|
def process_and_ingest_arxiv_paper(paper_id, additional_keywords):
    """Fetch an arXiv paper's metadata and store it via add_media_with_keywords.

    Args:
        paper_id: arXiv identifier of the paper to ingest.
        additional_keywords: Extra keyword text appended after the automatic
            keywords; skipped when falsy.

    Returns:
        A status message string. Failures are not raised; they are reported
        in the returned message.
    """
    try:
        xml_content = fetch_arxiv_xml(paper_id)
        markdown, title, authors, categories = convert_xml_to_markdown(xml_content)

        # Join from a list so a paper without categories yields "arxiv"
        # rather than "arxiv," (which would add an empty keyword).
        keyword_parts = ["arxiv", *categories]
        if additional_keywords:
            keyword_parts.append(f"{additional_keywords}")
        keywords = ','.join(keyword_parts)

        add_media_with_keywords(
            url=f"https://arxiv.org/abs/{paper_id}",
            title=title,
            media_type='document',
            content=markdown,
            keywords=keywords,
            prompt='No prompt for arXiv papers',
            summary='arXiv paper ingested from XML',
            transcription_model='None',
            author=', '.join(authors),
            ingestion_date=datetime.now().strftime('%Y-%m-%d'),
        )

        return f"arXiv paper '{title}' ingested successfully."
    except Exception as e:
        # Deliberate catch-all: any failure is turned into a status message
        # for the caller instead of propagating.
        return f"Error processing arXiv paper: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|