retrieval_metadata / arxiv_fetcher.py
donb-hf's picture
initial commit
edd8809
raw
history blame
1.51 kB
# arxiv_fetcher.py
import arxiv
from typing import List, Dict, Any
import logging
# Configure the root logger as soon as this module is imported: INFO level and
# a "timestamp - level - message" line format. The functions below log through
# the root logger (`logging.info`, `logging.warning`, ...), so this call is what
# makes their messages visible.
# NOTE(review): basicConfig is a no-op if the root logger already has handlers,
# so an application that configures logging first keeps its own settings.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def _result_to_metadata(result: Any) -> Dict[str, Any]:
    """Flatten one arXiv search result into a plain metadata dict."""
    return {
        "title": result.title,
        "authors": [author.name for author in result.authors],
        "published": result.published.isoformat(),
        "updated": result.updated.isoformat(),
        "pdf_url": result.pdf_url,
        "entry_id": result.entry_id,
        "summary": result.summary,
        "categories": result.categories,
        "primary_category": result.primary_category,
        # get_short_id() keeps the archive prefix of old-style identifiers
        # (e.g. "hep-th/9901001v1"); taking only the last "/"-separated
        # segment of entry_id would drop it and yield a broken link.
        "html_url": f"http://arxiv.org/abs/{result.get_short_id()}",
    }


def fetch_arxiv_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
    """Search arXiv and return metadata for the most recently submitted matches.

    Args:
        query: arXiv search query string. ``None``, empty, or whitespace-only
            queries are rejected without contacting the API.
        max_results: Upper bound on the number of papers to return. Values
            less than or equal to zero yield an empty list.

    Returns:
        A list of dicts, one per paper, with the keys ``title``, ``authors``,
        ``published``, ``updated``, ``pdf_url``, ``entry_id``, ``summary``,
        ``categories``, ``primary_category`` and ``html_url``. The two
        timestamps are ISO-8601 strings. If the fetch fails part-way, the
        error is logged and the results collected so far are returned
        (best-effort; no exception propagates to the caller).
    """
    logging.info("Fetching arXiv metadata for query: %s", query)

    if not query or not query.strip():
        logging.warning("Empty or whitespace-only query provided")
        return []

    if max_results <= 0:
        logging.warning("Non-positive max_results (%s) provided", max_results)
        return []

    # The arXiv API serves at most 2000 results per request; asking for one
    # page as large as max_results is only valid up to that limit.
    max_page_size = 2000
    client = arxiv.Client(
        page_size=min(max_results, max_page_size),
        delay_seconds=3,
        num_retries=3,
    )
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )

    results: List[Dict[str, Any]] = []
    try:
        # client.results() is lazy: network and parsing errors surface while
        # iterating, so the whole loop sits inside the try.
        for result in client.results(search):
            results.append(_result_to_metadata(result))
    except Exception as e:
        # Deliberately broad: this is a best-effort fetch that returns partial
        # results. logging.exception records the traceback as well.
        logging.exception("Error fetching metadata: %s", e)
    else:
        logging.info("Fetched metadata for %d papers", len(results))

    return results