| """ | |
| Paper Auto-Discovery (PAD) Module | |
| Provides intelligent paper search across multiple sources: | |
| - Semantic Scholar Graph API v1 | |
| - arXiv API | |
| Aggregates results and provides unified interface for paper discovery. | |
| """ | |

import requests
import xml.etree.ElementTree as ET
from typing import List, Dict, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging

logger = logging.getLogger(__name__)


class PaperSearchResult:
    """Represents a single paper search result."""

    def __init__(
        self,
        title: str,
        authors: List[str],
        year: Optional[int],
        abstract: str,
        url: str,
        pdf_url: Optional[str],
        source: str,  # "semantic_scholar" or "arxiv"
        paper_id: str,
    ):
        self.title = title
        self.authors = authors
        self.year = year
        self.abstract = abstract
        self.url = url
        self.pdf_url = pdf_url
        self.source = source
        self.paper_id = paper_id

    def to_dict(self) -> Dict:
        """Convert to a dictionary for easy JSON serialization."""
        return {
            "title": self.title,
            "authors": self.authors,
            "year": self.year,
            "abstract": self.abstract,
            "url": self.url,
            "pdf_url": self.pdf_url,
            "source": self.source,
            "paper_id": self.paper_id,
        }

    def __repr__(self):
        authors_str = ", ".join(self.authors[:3])
        if len(self.authors) > 3:
            authors_str += " et al."
        return f"<PaperSearchResult: {self.title[:50]}... by {authors_str} ({self.year})>"


class PaperDiscoveryEngine:
    """
    PAD - Paper Auto-Discovery Engine

    Searches for research papers across multiple sources and returns
    unified results with PDF links when available.
    """

    SEMANTIC_SCHOLAR_API = "https://api.semanticscholar.org/graph/v1/paper/search"
    ARXIV_API = "http://export.arxiv.org/api/query"

    def __init__(self, max_results: int = 5):
        self.max_results = max_results
        self.session = requests.Session()
        # Set a User-Agent to avoid 403 errors
        self.session.headers.update({
            "User-Agent": "PaperCast/1.0 (Research Paper Discovery; batuhan@papercast.io)"
        })
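        # Aside: Semantic Scholar also supports an optional API key (sent as
        # an "x-api-key" header) for higher rate limits; this module is not
        # configured with one and runs unauthenticated.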

    def search(self, query: str) -> List[PaperSearchResult]:
        """
        Search for papers across all sources in parallel.

        Args:
            query: Search query (e.g., "diffusion models", "Grok reasoning")

        Returns:
            List of PaperSearchResult objects. Each source returns its hits
            in relevance order, but because sources are collected as they
            complete, cross-source ordering is not guaranteed.
        """
        logger.info(f"PAD: Searching for '{query}'")
        results = []

        # Run both API calls in parallel for speed
        with ThreadPoolExecutor(max_workers=2) as executor:
            future_semantic = executor.submit(self._search_semantic_scholar, query)
            future_arxiv = executor.submit(self._search_arxiv, query)

            # Collect results as they complete
            for future in as_completed([future_semantic, future_arxiv]):
                try:
                    partial_results = future.result()
                    results.extend(partial_results)
                except Exception as e:
                    logger.error(f"PAD: Search failed for one source: {e}")

        # Deduplicate by exact title match (case-insensitive); near-duplicates
        # that differ in punctuation or whitespace are not caught.
        seen_titles = set()
        unique_results = []
        for result in results:
            title_lower = result.title.lower().strip()
            if title_lower not in seen_titles:
                seen_titles.add(title_lower)
                unique_results.append(result)

        # Limit to max_results
        unique_results = unique_results[:self.max_results]

        logger.info(f"PAD: Found {len(unique_results)} unique papers")
        return unique_results

    def _search_semantic_scholar(self, query: str) -> List[PaperSearchResult]:
        """Search the Semantic Scholar Graph API v1."""
        try:
            logger.debug("PAD: Querying Semantic Scholar...")
            params = {
                "query": query,
                "fields": "title,authors,year,abstract,openAccessPdf,url,paperId",
                "limit": self.max_results,
            }
            response = self.session.get(
                self.SEMANTIC_SCHOLAR_API,
                params=params,
                timeout=10,
            )

            # Handle rate limiting gracefully - just skip Semantic Scholar
            if response.status_code == 429:
                logger.warning(
                    "PAD: Semantic Scholar rate limit exceeded (429). "
                    "Relying on arXiv results."
                )
                return []

            response.raise_for_status()
            data = response.json()
            papers = data.get("data", [])

            results = []
            for paper in papers:
                # Extract the open-access PDF URL if available
                pdf_url = None
                if paper.get("openAccessPdf"):
                    pdf_url = paper["openAccessPdf"].get("url")

                # Extract author names
                authors = []
                for author in paper.get("authors", []):
                    if "name" in author:
                        authors.append(author["name"])

                result = PaperSearchResult(
                    title=paper.get("title", "Untitled"),
                    authors=authors,
                    year=paper.get("year"),
                    # "abstract" may be present but null in the response
                    abstract=paper.get("abstract") or "No abstract available.",
                    url=paper.get("url", ""),
                    pdf_url=pdf_url,
                    source="semantic_scholar",
                    paper_id=paper.get("paperId", ""),
                )
                results.append(result)

            logger.debug(f"PAD: Semantic Scholar returned {len(results)} papers")
            return results
        except Exception as e:
            logger.error(f"PAD: Semantic Scholar search failed: {e}")
            return []

    def _search_arxiv(self, query: str) -> List[PaperSearchResult]:
        """Search the arXiv API."""
        try:
            logger.debug("PAD: Querying arXiv...")
            params = {
                "search_query": f"all:{query}",
                "max_results": self.max_results,
                "sortBy": "relevance",
                "sortOrder": "descending",
            }
            response = self.session.get(
                self.ARXIV_API,
                params=params,
                timeout=10,
            )
            response.raise_for_status()

            # Parse the Atom XML response
            root = ET.fromstring(response.content)

            # Define namespaces
            ns = {
                "atom": "http://www.w3.org/2005/Atom",
                "arxiv": "http://arxiv.org/schemas/atom",
            }

            results = []
            for entry in root.findall("atom:entry", ns):
                # Extract title (guarding against missing or empty elements)
                title_elem = entry.find("atom:title", ns)
                title = (
                    title_elem.text.strip()
                    if title_elem is not None and title_elem.text
                    else "Untitled"
                )

                # Extract authors
                authors = []
                for author in entry.findall("atom:author", ns):
                    name_elem = author.find("atom:name", ns)
                    if name_elem is not None and name_elem.text:
                        authors.append(name_elem.text.strip())

                # Extract abstract
                summary_elem = entry.find("atom:summary", ns)
                abstract = (
                    summary_elem.text.strip()
                    if summary_elem is not None and summary_elem.text
                    else "No abstract available."
                )

                # Extract URL (abstract page)
                url_elem = entry.find("atom:id", ns)
                url = url_elem.text.strip() if url_elem is not None and url_elem.text else ""

                # Extract PDF URL
                pdf_url = None
                for link in entry.findall("atom:link", ns):
                    if link.get("type") == "application/pdf":
                        pdf_url = link.get("href")
                        break

                # Extract year from the published date
                published_elem = entry.find("atom:published", ns)
                year = None
                if published_elem is not None:
                    try:
                        year = int(published_elem.text[:4])
                    except (ValueError, TypeError):
                        pass

                # Extract arXiv ID (last path segment, e.g. "2301.12345v1")
                paper_id = url.split("/")[-1] if url else ""

                result = PaperSearchResult(
                    title=title,
                    authors=authors,
                    year=year,
                    abstract=abstract,
                    url=url,
                    pdf_url=pdf_url,
                    source="arxiv",
                    paper_id=paper_id,
                )
                results.append(result)

            logger.debug(f"PAD: arXiv returned {len(results)} papers")
            return results
        except Exception as e:
            logger.error(f"PAD: arXiv search failed: {e}")
            return []

    def get_pdf_url(self, result: PaperSearchResult) -> Optional[str]:
        """
        Get the best available PDF URL for a search result.

        Returns a direct PDF URL if available, otherwise returns the paper
        URL, which can be processed by the existing fetching logic.
        """
        if result.pdf_url:
            return result.pdf_url

        # For arXiv papers without a direct PDF link, construct it by
        # converting the abstract URL to a PDF URL:
        # https://arxiv.org/abs/2301.12345 -> https://arxiv.org/pdf/2301.12345.pdf
        if result.source == "arxiv" and result.url:
            return result.url.replace("/abs/", "/pdf/") + ".pdf"

        # Fall back to the paper URL (existing logic can handle it)
        return result.url


# Convenience function for easy import
def search_papers(query: str, max_results: int = 5) -> List[PaperSearchResult]:
    """
    Search for research papers across multiple sources.

    Args:
        query: Search query (e.g., "diffusion models", "Grok reasoning")
        max_results: Maximum number of results to return (default: 5)

    Returns:
        List of PaperSearchResult objects

    Example:
        >>> results = search_papers("transformer attention mechanisms")
        >>> for paper in results:
        ...     print(f"{paper.title} ({paper.year})")
        ...     print(f"  PDF: {paper.pdf_url}")
    """
    engine = PaperDiscoveryEngine(max_results=max_results)
    return engine.search(query)
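

# Minimal usage sketch (illustrative only): assumes network access to both
# APIs; the query below is hypothetical and chosen purely for demonstration.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    engine = PaperDiscoveryEngine(max_results=3)
    for paper in engine.search("retrieval augmented generation"):
        print(paper)
        print(f"  PDF: {engine.get_pdf_url(paper)}")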