| """
|
| VortexScienceScraper: Scrapes scientific content from open access sources.
|
| Respects robots.txt and rate limits.
|
| """
|
|
|
import json
import time
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Dict, List, Optional
from urllib.robotparser import RobotFileParser

import requests
|
|
|
|
|
class VortexScienceScraper:
    """Scrape scientific content from open access sources.

    Supported sources: arXiv, PubMed Central, Wikipedia, NIST, NASA.
    Optionally honors robots.txt and always enforces a per-source
    minimum delay between requests.
    """

    # Per-source endpoints and minimum delay (seconds) between requests.
    SOURCES = {
        "arxiv": {
            "base_url": "https://arxiv.org",
            "search_url": "https://arxiv.org/search/",
            "rate_limit": 1.0,
            "robots": "https://arxiv.org/robots.txt",
        },
        "pubmed": {
            "base_url": "https://www.ncbi.nlm.nih.gov/pmc",
            "search_url": "https://www.ncbi.nlm.nih.gov/pmc/articles/",
            "rate_limit": 0.5,
            "robots": "https://www.ncbi.nlm.nih.gov/robots.txt",
        },
        "wikipedia": {
            "base_url": "https://en.wikipedia.org",
            "search_url": "https://en.wikipedia.org/w/api.php",
            "rate_limit": 0.1,
            "robots": "https://en.wikipedia.org/robots.txt",
        },
        "nist": {
            "base_url": "https://webbook.nist.gov",
            "search_url": "https://webbook.nist.gov/cgi/cbook.cgi",
            "rate_limit": 1.0,
            "robots": "https://webbook.nist.gov/robots.txt",
        },
        "nasa": {
            "base_url": "https://ntrs.nasa.gov",
            "search_url": "https://ntrs.nasa.gov/api/citations/search",
            "rate_limit": 1.0,
            "robots": "https://ntrs.nasa.gov/robots.txt",
        },
    }

    # Abandon a request after this many seconds.  The previous version sent
    # requests with no timeout, so one stalled server could hang a scrape
    # run indefinitely.
    REQUEST_TIMEOUT = 30.0

    def __init__(
        self,
        output_dir: str = "./data/scraped",
        respect_robots: bool = True,
        user_agent: str = "VortexScientificBot/1.0",
    ):
        """
        Initialize scraper.

        Args:
            output_dir: Directory to save scraped data (created if missing)
            respect_robots: Whether to respect robots.txt
            user_agent: User agent string for requests
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.respect_robots = respect_robots
        self.user_agent = user_agent

        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})

        # base_url -> RobotFileParser, or None when robots.txt was unreadable.
        self.robots_cache: Dict[str, Optional[RobotFileParser]] = {}

        # source name -> time.time() of the last request, for rate limiting.
        self.last_request_time: Dict[str, float] = {}

    def _check_robots_allowed(self, url: str) -> bool:
        """Return True if robots.txt (when respected) permits fetching url.

        An unreadable robots.txt is treated as "disallowed", and the failure
        is cached so the file is not re-downloaded on every subsequent call
        (previously each failed host retried the download per URL checked).
        """
        if not self.respect_robots:
            return True

        from urllib.parse import urlparse

        parsed = urlparse(url)
        base_url = f"{parsed.scheme}://{parsed.netloc}"

        if base_url not in self.robots_cache:
            rp = RobotFileParser()
            rp.set_url(base_url + "/robots.txt")
            try:
                rp.read()
            except Exception as e:
                print(f"Could not read robots.txt for {base_url}: {e}")
                rp = None  # remember the failure; stay conservative
            self.robots_cache[base_url] = rp

        rp = self.robots_cache[base_url]
        if rp is None:
            return False
        return rp.can_fetch(self.user_agent, url)

    def _rate_limit(self, source: str):
        """Sleep if needed so consecutive requests to *source* are at least
        SOURCES[source]["rate_limit"] seconds apart, then record the time."""
        now = time.time()
        last = self.last_request_time.get(source, 0)
        delay = self.SOURCES[source]["rate_limit"]
        if now - last < delay:
            time.sleep(delay - (now - last))
        self.last_request_time[source] = time.time()

    def scrape_arxiv(
        self,
        query: str,
        max_results: int = 100,
        categories: Optional[List[str]] = None,
    ) -> List[Dict]:
        """
        Scrape arXiv papers via the official Atom query API.

        The previous version hit the HTML search page and returned fabricated
        placeholder entries; this parses real results from the export API.

        Args:
            query: Search query
            max_results: Maximum number of results (API capped at 200)
            categories: Optional list of arXiv categories (e.g., ['physics', 'math'])

        Returns:
            List of paper metadata and abstracts
        """
        papers: List[Dict] = []

        search_query = f"all:{query}"
        if categories:
            cat_filter = " OR ".join(f"cat:{c}" for c in categories)
            search_query = f"({search_query}) AND ({cat_filter})"

        url = "http://export.arxiv.org/api/query"
        params = {
            "search_query": search_query,
            "start": 0,
            "max_results": min(max_results, 200),
            "sortBy": "submittedDate",
            "sortOrder": "descending",
        }

        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows scraping {url}")
            return papers

        try:
            self._rate_limit("arxiv")
            response = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()

            # The API returns an Atom feed; each <entry> is one paper.
            ns = {"atom": "http://www.w3.org/2005/Atom"}
            root = ET.fromstring(response.text)
            for entry in root.findall("atom:entry", ns):
                pdf_url = ""
                for link in entry.findall("atom:link", ns):
                    if link.get("title") == "pdf":
                        pdf_url = link.get("href", "")
                papers.append({
                    "source": "arxiv",
                    "title": (entry.findtext("atom:title", "", ns) or "").strip(),
                    "abstract": (entry.findtext("atom:summary", "", ns) or "").strip(),
                    "pdf_url": pdf_url,
                })

            print(f"Scraped {len(papers)} arXiv papers for query '{query}'")

        except Exception as e:
            print(f"Error scraping arXiv: {e}")

        return papers

    def scrape_pubmed(
        self,
        query: str,
        max_results: int = 100,
    ) -> List[Dict]:
        """Search PubMed Central via the NCBI E-utilities esearch API.

        Returns up to *max_results* entries with PMC id and article URL
        (previously the result list was silently truncated to 10).
        """
        articles: List[Dict] = []

        url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        params = {
            "db": "pmc",
            "term": query,
            "retmax": max_results,
            "retmode": "json",
        }

        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows {url}")
            return articles

        try:
            self._rate_limit("pubmed")
            response = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()

            data = response.json()
            pmc_ids = data.get("esearchresult", {}).get("idlist", [])

            for pmc_id in pmc_ids[:max_results]:
                articles.append({
                    "source": "pubmed",
                    "pmc_id": pmc_id,
                    "url": f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_id}/",
                })

            print(f"Found {len(pmc_ids)} PubMed articles")

        except Exception as e:
            print(f"Error scraping PubMed: {e}")

        return articles

    def scrape_wikipedia(
        self,
        topic: str,
        max_pages: int = 10,
    ) -> List[Dict]:
        """Fetch intro extracts for *topic* from the Wikipedia API.

        Note: the API's "titles" lookup resolves the single named page (plus
        redirects), so max_pages is an upper bound rather than a fan-out.
        """
        pages: List[Dict] = []

        url = self.SOURCES["wikipedia"]["search_url"]  # was a duplicated literal
        params = {
            "action": "query",
            "format": "json",
            "prop": "extracts",
            "exintro": True,
            "titles": topic,
            "redirects": True,
        }

        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows {url}")
            return pages

        try:
            self._rate_limit("wikipedia")
            response = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()

            data = response.json()
            pages_data = data.get("query", {}).get("pages", {})

            for page_id, page in pages_data.items():
                # Missing pages come back without an "extract" key; skip them.
                if "extract" in page:
                    pages.append({
                        "source": "wikipedia",
                        "title": page.get("title", ""),
                        "text": page.get("extract", ""),
                    })

        except Exception as e:
            print(f"Error scraping Wikipedia: {e}")

        return pages

    def scrape_nist(
        self,
        element: str,
    ) -> List[Dict]:
        """Scrape the NIST Chemistry WebBook for a formula/element.

        Returns at most one entry containing the first 1000 characters of
        the response HTML (full parsing is left to downstream consumers).
        """
        data: List[Dict] = []

        url = self.SOURCES["nist"]["search_url"]  # was a duplicated literal
        params = {
            "Formula": element,
            "Units": "SI",
            "Submit": "Submit",
        }

        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows {url}")
            return data

        try:
            self._rate_limit("nist")
            response = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()

            data.append({
                "source": "nist",
                "element": element,
                "html": response.text[:1000],
            })

        except Exception as e:
            print(f"Error scraping NIST: {e}")

        return data

    def scrape_nasa(
        self,
        query: str,
        max_results: int = 50,
    ) -> List[Dict]:
        """Search NASA technical reports (NTRS citations API).

        Returns up to *max_results* entries (previously truncated to 10).
        """
        reports: List[Dict] = []

        url = self.SOURCES["nasa"]["search_url"]  # was a duplicated literal
        params = {
            "q": query,
            "page[size]": max_results,
        }

        if not self._check_robots_allowed(url):
            print(f"Robots.txt disallows {url}")
            return reports

        try:
            self._rate_limit("nasa")
            response = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()

            data = response.json()
            for item in data.get("data", [])[:max_results]:
                attrs = item.get("attributes", {})
                reports.append({
                    "source": "nasa",
                    "title": attrs.get("title", ""),
                    "abstract": attrs.get("abstract", ""),
                    "download_url": attrs.get("downloads", {}).get("pdf", ""),
                })

        except Exception as e:
            print(f"Error scraping NASA: {e}")

        return reports

    def save_results(
        self,
        results: List[Dict],
        filename: str,
    ):
        """Save scraped results as pretty-printed UTF-8 JSON in output_dir."""
        output_path = self.output_dir / filename
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print(f"Saved {len(results)} results to {output_path}")

    def scrape_all_sources(
        self,
        queries: Dict[str, str],
        max_per_source: int = 50,
    ) -> Dict[str, List[Dict]]:
        """
        Scrape all sources with given queries.

        Args:
            queries: Dict mapping source name to query string
            max_per_source: Max results per source

        Returns:
            Dict mapping source to list of results (also saved to
            "<source>_results.json" in output_dir)
        """
        # Dispatch table keeps each source's call signature in one place.
        scrapers = {
            "arxiv": lambda q: self.scrape_arxiv(q, max_results=max_per_source),
            "pubmed": lambda q: self.scrape_pubmed(q, max_results=max_per_source),
            "wikipedia": lambda q: self.scrape_wikipedia(q, max_pages=max_per_source),
            "nist": lambda q: self.scrape_nist(q),
            "nasa": lambda q: self.scrape_nasa(q, max_results=max_per_source),
        }

        all_results: Dict[str, List[Dict]] = {}

        for source, query in queries.items():
            if source not in self.SOURCES:
                print(f"Unknown source: {source}")
                continue

            print(f"Scraping {source} with query: {query}")
            results = scrapers[source](query) if source in scrapers else []
            all_results[source] = results

            self.save_results(results, f"{source}_results.json")

        return all_results
|
|
|
|
|
def test_scraper():
    """Run a small live smoke test against two sources (network required)."""
    scraper = VortexScienceScraper()

    print("Testing Wikipedia scrape...")
    wiki_pages = scraper.scrape_wikipedia("quantum mechanics", max_pages=2)
    print(f"Got {len(wiki_pages)} Wikipedia pages")

    print("Testing arXiv scrape...")
    arxiv_papers = scraper.scrape_arxiv("quantum", max_results=5)
    print(f"Got {len(arxiv_papers)} arXiv papers")

    print("Scraper test passed!")
|
|
|
|
|
# Run the live network smoke test when executed directly as a script.
if __name__ == "__main__":

    test_scraper()
|
|
|