# Copyright 2023 by Jan Philip Wahle, https://jpwahle.com/
# All rights reserved.

import asyncio
import json

import aiohttp
import requests
from bs4 import BeautifulSoup


async def fetch(session, url):
    """Asynchronous function to fetch a URL using aiohttp."""
    async with session.get(url) as response:
        return await response.text()
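

# Note: fetch() returns the response body even for HTTP error statuses. A
# stricter variant (a sketch, not part of the original script) could fail
# fast on 4xx/5xx responses instead:
async def fetch_strict(session, url):
    """Fetch a URL and raise aiohttp.ClientResponseError on error statuses."""
    async with session.get(url) as response:
        response.raise_for_status()
        return await response.text()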


async def async_match_acl_id_to_s2_paper(acl_id):
    """
    Fetches the paper information from the Semantic Scholar API for the given ACL ID.

    Args:
        acl_id (str): The ACL ID of the paper to fetch.

    Returns:
        dict: A dictionary containing the paper information.
    """
    url = f"https://api.semanticscholar.org/graph/v1/paper/ACL:{acl_id}"
    async with aiohttp.ClientSession() as session:
        res_text = await fetch(session, url)
        return json.loads(res_text)
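

# A minimal variant (a sketch, not part of the original script): the Semantic
# Scholar Graph API also accepts a `fields` query parameter to limit the
# response; the field list used as a default below is an assumption to adapt
# as needed.
async def async_match_acl_id_to_s2_paper_fields(
    acl_id, fields="title,authors,externalIds"
):
    """Like async_match_acl_id_to_s2_paper, but requests explicit fields."""
    url = (
        "https://api.semanticscholar.org/graph/v1/paper/"
        f"ACL:{acl_id}?fields={fields}"
    )
    async with aiohttp.ClientSession() as session:
        return json.loads(await fetch(session, url))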


def extract_paper_info(paper_url):
    """
    Extracts information about a paper from its ACL Anthology URL.

    Args:
        paper_url (str): The URL of the paper on the ACL Anthology website.

    Returns:
        dict: A dictionary containing the title, authors, and ACL Anthology ID of the paper.
    """
    html_doc = requests.get(paper_url, timeout=10).text
    soup = BeautifulSoup(html_doc, "html.parser")
    title = soup.find("h2", id="title").text.strip()
    # Author links live in the <p class="lead"> element below the title;
    # use .get() so paragraphs without a class attribute don't raise KeyError.
    authors = [
        a.text
        for a in soup.find_all("a")
        if a.parent.name == "p" and a.parent.get("class") == ["lead"]
    ]
    # For a URL like "https://aclanthology.org/2023.acl-long.1/", the
    # second-to-last segment is the ACL ID, e.g. "2023.acl-long.1".
    acl_id = paper_url.split("/")[-2]
    return {"title": title, "authors": authors, "acl_id": acl_id}


def extract_author_info(author_url):
    """
    Extracts author information from the given author URL.

    Args:
        author_url (str): The URL of the author's page on ACL Anthology.

    Returns:
        dict: A dictionary containing the author's name and a list of their papers.
              Each paper is represented as a dictionary with keys "title" and "url".
    """
    html_doc = requests.get(author_url, timeout=10).text
    soup = BeautifulSoup(html_doc, "html.parser")
    author_name = soup.find("h2", id="title").text.strip()
    paper_elements = soup.find_all("p")
    papers = []
    for paper in paper_elements:
        links = paper.find_all("a")
        # Drop the "pdf", "bib", and "abs" links; the first remaining link
        # is the paper title.
        links = [
            link for link in links if link.text.strip() not in ["pdf", "bib", "abs"]
        ]
        if not links:
            continue
        title = links[0].text.strip()
        url = "https://aclanthology.org" + links[0]["href"]
        papers.append({"title": title, "url": url})
    return {"author": author_name, "papers": papers}


def extract_venue_info(venue_url):
    """
    Extracts venue information from the given URL.

    Args:
        venue_url (str): The URL of the venue to extract information from.

    Returns:
        dict: A dictionary containing the venue name and a list of papers with their titles and URLs.
    """
    html_doc = requests.get(venue_url, timeout=10).text
    soup = BeautifulSoup(html_doc, "html.parser")
    venue_name = soup.find("h2", id="title").text.strip()
    paper_elements = soup.find_all("p")
    papers = []
    for paper in paper_elements:
        links = paper.find_all("a")
        # Drop the "pdf", "bib", and "abs" links; the first remaining link
        # is the paper title.
        links = [
            link for link in links if link.text.strip() not in ["pdf", "bib", "abs"]
        ]
        if not links:
            continue
        title = links[0].text.strip()
        url = "https://aclanthology.org" + links[0]["href"]
        papers.append({"title": title, "url": url})
    return {"venue": venue_name, "papers": papers}


def determine_page_type(url):
    """
    Determine the type of ACL Anthology page given its URL.

    Args:
        url (str): The URL to be checked.

    Returns:
        str: "paper", "author", or "venue". Returns None if the type can't be determined.
    """
    # Extract the non-empty path segments from the URL
    segments = [segment for segment in url.split("/") if segment]
    # Check if the URL points to an event (venue)
    if "events" in url or "volumes" in url:
        return "venue"
    # If the last segment looks like "2023.acl-long.1", it's a paper
    if (
        segments
        and "." in segments[-1]
        and segments[-1].split(".")[0].isnumeric()
    ):
        return "paper"
    if "people" in url:
        return "author"
    # If none of the above rules apply, fetch the page and check its content
    try:
        html_doc = requests.get(url, timeout=10).text
        soup = BeautifulSoup(html_doc, "html.parser")
        # Check for unique elements specific to each page type
        if soup.find("h2", id="title"):
            return (
                "author"
                if soup.find("a", href=True, string="Google Scholar")
                else "paper"
            )
        elif soup.find("h1", string="Anthology Volume"):
            return "venue"
    except Exception as e:
        print(f"Error determining page type: {e}")
    return None
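

# The string heuristics above classify the three demo URLs below without a
# network request (the expected values follow from the rules, not from fetching):
#   determine_page_type("https://aclanthology.org/2023.acl-long.1/")       -> "paper"
#   determine_page_type("https://aclanthology.org/people/a/anna-rogers/")  -> "author"
#   determine_page_type("https://aclanthology.org/events/acl-2022/")       -> "venue"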
if __name__ == "__main__": | |
loop = asyncio.get_event_loop() | |
urls = [ | |
"https://aclanthology.org/2023.acl-long.1/", | |
"https://aclanthology.org/people/a/anna-rogers/", | |
"https://aclanthology.org/events/acl-2022/", | |
] | |
for url in urls: | |
if determine_page_type(url) == "paper": | |
print(f"Paper: {url}") | |
res = extract_paper_info(url) | |
paper = loop.run_until_complete( | |
async_match_acl_id_to_s2_paper(res["acl_id"]) | |
) | |
print(paper) | |
elif determine_page_type(url) == "author": | |
print(f"Author: {url}") | |
res = extract_author_info(url) | |
tasks = [ | |
async_match_acl_id_to_s2_paper(paper["url"].split("/")[-2]) | |
for paper in res["papers"] | |
] | |
s2_ids = loop.run_until_complete(asyncio.gather(*tasks)) | |
for paper, s2_id in zip(res["papers"], s2_ids): | |
print(paper["paperId"]) | |
elif determine_page_type(url) == "venue": | |
print(f"Venue: {url}") | |
res = extract_venue_info(url) | |
tasks = [ | |
async_match_acl_id_to_s2_paper(paper["url"].split("/")[-2]) | |
for paper in res["papers"] | |
] | |
s2_ids = loop.run_until_complete(asyncio.gather(*tasks)) | |
for paper, s2_id in zip(res["papers"], s2_ids): | |
print(paper["paperId"]) | |
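

# On newer Python versions, asyncio.get_event_loop() outside a running loop
# emits a DeprecationWarning; an alternative sketch (shown for a single paper
# URL only, not the original flow) would wrap the coroutine in asyncio.run:
#
#     res = extract_paper_info("https://aclanthology.org/2023.acl-long.1/")
#     paper = asyncio.run(async_match_acl_id_to_s2_paper(res["acl_id"]))
#     print(paper)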