# field-diversity / aclanthology.py
# jpwahle: Initial commit (505fd08)
# Copyright 2023 by Jan Philip Wahle, https://jpwahle.com/
# All rights reserved.
import asyncio
import json
import re
from urllib.parse import urljoin

import aiohttp
import requests
from bs4 import BeautifulSoup
async def fetch(session, url):
    """
    Issue a GET request for ``url`` on ``session`` and return the body text.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with an awaitable ``text()`` method
            (the file passes an ``aiohttp.ClientSession``).
        url (str): The URL to request.

    Returns:
        str: The decoded response body.
    """
    pending_request = session.get(url)
    async with pending_request as reply:
        body = await reply.text()
    return body
async def async_match_acl_id_to_s2_paper(acl_id):
    """
    Look up a paper on the Semantic Scholar Graph API by its ACL ID.

    A fresh ``aiohttp.ClientSession`` is opened for the single request and
    closed before the response body is decoded.

    Args:
        acl_id (str): The ACL ID of the paper to fetch.

    Returns:
        dict: The JSON-decoded response body returned by the API.
    """
    endpoint = f"https://api.semanticscholar.org/graph/v1/paper/ACL:{acl_id}"
    async with aiohttp.ClientSession() as client:
        raw_body = await fetch(client, endpoint)
    return json.loads(raw_body)
def extract_paper_info(paper_url):
    """
    Extracts information about a paper from its ACL Anthology URL.

    Args:
        paper_url (str): The URL of the paper on the ACL Anthology website,
            with or without a trailing slash.

    Returns:
        dict: A dictionary with the keys "title" (str), "authors" (list of
            str) and "acl_id" (str, the last path segment of ``paper_url``).

    Raises:
        AttributeError: If the page contains no ``<h2 id="title">`` element.
        Any exception raised by ``requests.get`` propagates unchanged.
    """
    html_doc = requests.get(paper_url, timeout=10).text
    soup = BeautifulSoup(html_doc, "html.parser")
    title = soup.find("h2", id="title").text.strip()
    # Author names are the anchors sitting directly inside <p class="lead">.
    # .get() is used because indexing a tag that has no class attribute
    # raises KeyError, which would abort on any plain <p> containing a link.
    authors = [
        a.text
        for a in soup.find_all("a")
        if a.parent.name == "p" and a.parent.get("class") == ["lead"]
    ]
    # The ID is the last path segment. Dropping trailing slashes first makes
    # ".../2023.acl-long.1" and ".../2023.acl-long.1/" yield the same ID
    # instead of returning the host name for the slash-less form.
    acl_id = paper_url.rstrip("/").split("/")[-1]
    return {"title": title, "authors": authors, "acl_id": acl_id}
def extract_author_info(author_url):
    """
    Extracts author information from the given author URL.

    Every ``<p>`` element that contains at least one link other than the
    "pdf" / "bib" / "abs" badges is treated as one paper entry; the first
    such link supplies the paper's title and URL.

    Args:
        author_url (str): The URL of the author's page on ACL Anthology.

    Returns:
        dict: A dictionary with the keys "author" (str) and "papers" (list).
            Each paper is a dictionary with the keys "title" and "url".

    Raises:
        AttributeError: If the page contains no ``<h2 id="title">`` element.
        Any exception raised by ``requests.get`` propagates unchanged.
    """
    html_doc = requests.get(author_url, timeout=10).text
    soup = BeautifulSoup(html_doc, "html.parser")
    author_name = soup.find("h2", id="title").text.strip()

    badge_labels = {"pdf", "bib", "abs"}
    papers = []
    for paragraph in soup.find_all("p"):
        # href=True skips anchors without a target, which would otherwise
        # raise KeyError when the href is read below.
        candidates = [
            link
            for link in paragraph.find_all("a", href=True)
            if link.text.strip() not in badge_labels
        ]
        if not candidates:
            continue
        title_link = candidates[0]
        papers.append(
            {
                "title": title_link.text.strip(),
                # urljoin gives the same result as plain concatenation for
                # root-relative hrefs and keeps absolute hrefs intact.
                "url": urljoin("https://aclanthology.org", title_link["href"]),
            }
        )
    return {"author": author_name, "papers": papers}
def extract_venue_info(venue_url):
    """
    Extracts venue information from the given URL.

    Every ``<p>`` element that contains at least one link other than the
    "pdf" / "bib" / "abs" badges is treated as one paper entry; the first
    such link supplies the paper's title and URL.

    Args:
        venue_url (str): The URL of the venue to extract information from.

    Returns:
        dict: A dictionary with the keys "venue" (str) and "papers" (list).
            Each paper is a dictionary with the keys "title" and "url".

    Raises:
        AttributeError: If the page contains no ``<h2 id="title">`` element.
        Any exception raised by ``requests.get`` propagates unchanged.
    """
    html_doc = requests.get(venue_url, timeout=10).text
    soup = BeautifulSoup(html_doc, "html.parser")
    venue_name = soup.find("h2", id="title").text.strip()

    badge_labels = {"pdf", "bib", "abs"}
    papers = []
    for paragraph in soup.find_all("p"):
        # href=True skips anchors without a target, which would otherwise
        # raise KeyError when the href is read below.
        candidates = [
            link
            for link in paragraph.find_all("a", href=True)
            if link.text.strip() not in badge_labels
        ]
        if not candidates:
            continue
        title_link = candidates[0]
        papers.append(
            {
                "title": title_link.text.strip(),
                # urljoin gives the same result as plain concatenation for
                # root-relative hrefs and keeps absolute hrefs intact.
                "url": urljoin("https://aclanthology.org", title_link["href"]),
            }
        )
    return {"venue": venue_name, "papers": papers}
# ACL Anthology paper IDs as they appear in the last URL path segment:
# current style "2023.acl-long.1" or legacy style "P19-1001".
_ACL_PAPER_ID = re.compile(r"\d{4}\.[A-Za-z0-9_-]+\.\d+|[A-Z]\d{2}-\d{4}")


def determine_page_type(url):
    """
    Determine the type of ACL Anthology page given its URL.

    The URL itself is inspected first; the page is downloaded only when the
    URL alone does not decide the type.

    Args:
        url (str): The URL to be checked.

    Returns:
        str: "paper", "author", or "venue". Returns None if the type can't be
            determined, including when the fallback download or parse fails
            (the error is printed, not raised).
    """
    # Non-empty path pieces; a trailing slash therefore does not matter.
    segments = [segment for segment in url.split("/") if segment]

    # A last segment shaped like a paper ID settles it without a download.
    # This runs before the substring checks below so that IDs which merely
    # contain "people"/"events" (e.g. "2020.peoples-1.1") are not misread.
    if segments and _ACL_PAPER_ID.fullmatch(segments[-1]):
        return "paper"

    # Check if the URL points to an event (venue)
    if "events" in url or "volumes" in url:
        return "venue"

    # Original heuristic, kept for backward compatibility: a numeric
    # second-to-last segment followed by a dotted last segment.
    if len(segments) > 1 and segments[-2].isnumeric() and "." in segments[-1]:
        return "paper"

    if "people" in url:
        return "author"

    # If none of the above rules apply, fetch the page and check its content
    try:
        html_doc = requests.get(url, timeout=10).text
        soup = BeautifulSoup(html_doc, "html.parser")

        # Both paper and author pages carry <h2 id="title">; a
        # "Google Scholar" link is taken as the mark of an author page.
        # `string=` is the current name of bs4's deprecated `text=` filter.
        if soup.find("h2", id="title"):
            if soup.find("a", href=True, string="Google Scholar"):
                return "author"
            return "paper"
        if soup.find("h1", string="Anthology Volume"):
            return "venue"
    except Exception as e:  # best effort: report and fall through to None
        print(f"Error determining page type: {e}")

    return None
async def _match_papers(papers):
    """
    Look up every listed paper on Semantic Scholar concurrently.

    Args:
        papers (list): Dictionaries with a "url" key whose last path segment
            is the paper's ACL ID.

    Returns:
        list: The decoded API responses, in the same order as ``papers``.
    """
    acl_ids = [paper["url"].rstrip("/").split("/")[-1] for paper in papers]
    return await asyncio.gather(
        *(async_match_acl_id_to_s2_paper(acl_id) for acl_id in acl_ids)
    )


def main():
    """Classify a few sample ACL Anthology URLs and print their Semantic Scholar matches."""
    urls = [
        "https://aclanthology.org/2023.acl-long.1/",
        "https://aclanthology.org/people/a/anna-rogers/",
        "https://aclanthology.org/events/acl-2022/",
    ]
    # Page types that list many papers: printed label and matching extractor.
    listing_pages = {
        "author": ("Author", extract_author_info),
        "venue": ("Venue", extract_venue_info),
    }

    for url in urls:
        # Classify once per URL: the call may itself download the page.
        page_type = determine_page_type(url)

        if page_type == "paper":
            print(f"Paper: {url}")
            res = extract_paper_info(url)
            paper = asyncio.run(async_match_acl_id_to_s2_paper(res["acl_id"]))
            print(paper)
        elif page_type in listing_pages:
            label, extract = listing_pages[page_type]
            print(f"{label}: {url}")
            res = extract(url)
            s2_papers = asyncio.run(_match_papers(res["papers"]))
            for s2_paper in s2_papers:
                # "paperId" lives on the Semantic Scholar response, not on
                # the scraped entry (which only has "title" and "url").
                # .get() tolerates responses that carry no paperId.
                print(s2_paper.get("paperId"))


if __name__ == "__main__":
    main()