# Copyright 2023 by Jan Philip Wahle, https://jpwahle.com/ # All rights reserved. import asyncio import json import aiohttp import requests from bs4 import BeautifulSoup async def fetch(session, url): """Asynchronous function to fetch a URL using aiohttp.""" async with session.get(url) as response: return await response.text() async def async_match_acl_id_to_s2_paper(acl_id): """ Fetches the paper information from the Semantic Scholar API for the given ACL ID. Args: acl_id (str): The ACL ID of the paper to fetch. Returns: dict: A dictionary containing the paper information. """ url = f"https://api.semanticscholar.org/graph/v1/paper/ACL:{acl_id}" async with aiohttp.ClientSession() as session: res_text = await fetch(session, url) return json.loads(res_text) def extract_paper_info(paper_url): """ Extracts information about a paper from its ACL Anthology URL. Args: paper_url (str): The URL of the paper on the ACL Anthology website. Returns: dict: A dictionary containing the title, authors, and ACL Anthology ID of the paper. """ html_doc = requests.get(paper_url, timeout=10).text soup = BeautifulSoup(html_doc, "html.parser") title = soup.find("h2", id="title").text.strip() authors = [ a.text for a in soup.find_all("a") if a.parent.name == "p" and a.parent["class"] == ["lead"] ] acl_id = paper_url.split("/")[-2] return {"title": title, "authors": authors, "acl_id": acl_id} def extract_author_info(author_url): """ Extracts author information from the given author URL. Args: author_url (str): The URL of the author's page on ACL Anthology. Returns: dict: A dictionary containing the author's name and a list of their papers. Each paper is represented as a dictionary with keys "title" and "url". """ html_doc = requests.get(author_url, timeout=10).text soup = BeautifulSoup(html_doc, "html.parser") author_name = soup.find("h2", id="title").text.strip() paper_elements = soup.find_all("p") papers = [] for paper in paper_elements: links = paper.find_all("a") # Filter out a with text pdf and bib links = [ l for l in links if l.text.strip() not in ["pdf", "bib", "abs"] ] if not links: continue title = links[0].text.strip() url = "https://aclanthology.org" + links[0]["href"] papers.append({"title": title, "url": url}) return {"author": author_name, "papers": papers} def extract_venue_info(venue_url): """ Extracts venue information from the given URL. Args: venue_url (str): The URL of the venue to extract information from. Returns: dict: A dictionary containing the venue name and a list of papers with their titles and URLs. """ html_doc = requests.get(venue_url, timeout=10).text soup = BeautifulSoup(html_doc, "html.parser") venue_name = soup.find("h2", id="title").text.strip() paper_elements = soup.find_all("p") papers = [] for paper in paper_elements: links = paper.find_all("a") # Filter out a with text pdf and bib links = [ l for l in links if l.text.strip() not in ["pdf", "bib", "abs"] ] if not links: continue title = links[0].text.strip() url = "https://aclanthology.org" + links[0]["href"] papers.append({"title": title, "url": url}) return {"venue": venue_name, "papers": papers} def determine_page_type(url): """ Determine the type of ACL Anthology page given its URL. Args: url (str): The URL to be checked. Returns: str: "paper", "author", or "venue". Returns None if the type can't be determined. """ # Extract last segments from the URL segments = [segment for segment in url.split("/") if segment] # Check if the URL points to an event (venue) if "events" in url or "volumes" in url: return "venue" # If URL ends in a pattern like "2023.acl-long.1" it's a paper if len(segments) > 1 and segments[-2].isnumeric() and "." in segments[-1]: return "paper" if "people" in url: return "author" # If none of the above rules apply, fetch the page and check its content try: html_doc = requests.get(url, timeout=10).text soup = BeautifulSoup(html_doc, "html.parser") # Check for unique elements specific to each page type if soup.find("h2", id="title"): return ( "author" if soup.find("a", href=True, text="Google Scholar") else "paper" ) elif soup.find("h1", text="Anthology Volume"): return "venue" except Exception as e: print(f"Error determining page type: {e}") return None if __name__ == "__main__": loop = asyncio.get_event_loop() urls = [ "https://aclanthology.org/2023.acl-long.1/", "https://aclanthology.org/people/a/anna-rogers/", "https://aclanthology.org/events/acl-2022/", ] for url in urls: if determine_page_type(url) == "paper": print(f"Paper: {url}") res = extract_paper_info(url) paper = loop.run_until_complete( async_match_acl_id_to_s2_paper(res["acl_id"]) ) print(paper) elif determine_page_type(url) == "author": print(f"Author: {url}") res = extract_author_info(url) tasks = [ async_match_acl_id_to_s2_paper(paper["url"].split("/")[-2]) for paper in res["papers"] ] s2_ids = loop.run_until_complete(asyncio.gather(*tasks)) for paper, s2_id in zip(res["papers"], s2_ids): print(paper["paperId"]) elif determine_page_type(url) == "venue": print(f"Venue: {url}") res = extract_venue_info(url) tasks = [ async_match_acl_id_to_s2_paper(paper["url"].split("/")[-2]) for paper in res["papers"] ] s2_ids = loop.run_until_complete(asyncio.gather(*tasks)) for paper, s2_id in zip(res["papers"], s2_ids): print(paper["paperId"])