import cloudscraper
from bs4 import BeautifulSoup
import time

BASE_URL = "https://www.lightreading.com"
AUTHOR_URL = f"{BASE_URL}/author/iain-morris"
TARGET_COUNT = 100
DELAY = 1  # polite wait between requests

# Create a scraper that bypasses Cloudflare protection
scraper = cloudscraper.create_scraper()


def fetch_page(url):
    """Fetch and parse a page from the given URL."""
    resp = scraper.get(url)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")


def extract_article_links(soup):
    """Extract valid article links from a BeautifulSoup object."""
    links = []
    # Use the correct selector based on the actual HTML structure
    for a in soup.select('a.ListPreview-Title[data-testid="preview-default-title"]'):
        href = a['href']
        if href.startswith("/author/"):
            continue  # skip author links
        full_url = BASE_URL + href if href.startswith("/") else href
        links.append(full_url)
    return links


def scrape_latest_urls():
    """Scrape up to TARGET_COUNT article URLs from paginated author pages."""
    urls, seen = [], set()
    page_num = 1
    while len(urls) < TARGET_COUNT:
        page_url = f"{AUTHOR_URL}?page={page_num}"
        print(f"Fetching {page_url} …")
        soup = fetch_page(page_url)
        found = extract_article_links(soup)
        if not found:
            print("No more articles found; stopping.")
            break
        for u in found:
            if u not in seen:
                seen.add(u)
                urls.append(u)
                if len(urls) >= TARGET_COUNT:
                    break
        page_num += 1
        time.sleep(DELAY)
    return urls


if __name__ == "__main__":
    urls = scrape_latest_urls()
    print(f"\n✅ Collected {len(urls)} article URLs:\n")
    for idx, url in enumerate(urls, 1):
        print(f"{idx}. {url}")
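
# --- Optional follow-on (not part of the original script) -----------------
# A minimal sketch of how the collected URLs could be persisted instead of
# only printed; the "urls.txt" filename is an assumption, not something the
# script defines:
#
#     with open("urls.txt", "w", encoding="utf-8") as fh:
#         fh.write("\n".join(urls) + "\n")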