# Copyright 2023 by Jan Philip Wahle, https://jpwahle.com/
# All rights reserved.

import asyncio
import json

import aiohttp
import requests
from bs4 import BeautifulSoup


async def fetch(session, url):
    """Asynchronous function to fetch a URL using aiohttp."""
    async with session.get(url) as response:
        return await response.text()


async def async_match_acl_id_to_s2_paper(acl_id):
    """
    Fetches the paper information from the Semantic Scholar API for the given ACL ID.

    Args:
        acl_id (str): The ACL ID of the paper to fetch.

    Returns:
        dict: A dictionary containing the paper information.
    """
    url = f"https://api.semanticscholar.org/graph/v1/paper/ACL:{acl_id}"
    async with aiohttp.ClientSession() as session:
        res_text = await fetch(session, url)
    return json.loads(res_text)

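# Illustrative usage sketch (not part of the original flow): fetch a single Semantic Scholar
# record for the ACL ID used in the __main__ example below. The Graph API response is a dict
# with fields such as "paperId" and "title" (exact fields depend on the API defaults).
#
#   record = asyncio.run(async_match_acl_id_to_s2_paper("2023.acl-long.1"))
#   print(record.get("paperId"), record.get("title"))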

def extract_paper_info(paper_url):
    """
    Extracts information about a paper from its ACL Anthology URL.

    Args:
        paper_url (str): The URL of the paper on the ACL Anthology website.

    Returns:
        dict: A dictionary containing the title, authors, and ACL Anthology ID of the paper.
    """
    html_doc = requests.get(paper_url, timeout=10).text
    soup = BeautifulSoup(html_doc, "html.parser")

    title = soup.find("h2", id="title").text.strip()
    authors = [
        a.text
        for a in soup.find_all("a")
        # use .get() so <p> parents without a class attribute don't raise KeyError
        if a.parent.name == "p" and a.parent.get("class") == ["lead"]
    ]
    acl_id = paper_url.split("/")[-2]

    return {"title": title, "authors": authors, "acl_id": acl_id}

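# Illustrative usage sketch, reusing a paper URL from the __main__ block below:
#
#   info = extract_paper_info("https://aclanthology.org/2023.acl-long.1/")
#   # info -> {"title": "...", "authors": ["...", ...], "acl_id": "2023.acl-long.1"}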

def extract_author_info(author_url):
    """
    Extracts author information from the given author URL.

    Args:
        author_url (str): The URL of the author's page on ACL Anthology.

    Returns:
        dict: A dictionary containing the author's name and a list of their papers.
              Each paper is represented as a dictionary with keys "title" and "url".
    """
    html_doc = requests.get(author_url, timeout=10).text
    soup = BeautifulSoup(html_doc, "html.parser")

    author_name = soup.find("h2", id="title").text.strip()
    paper_elements = soup.find_all("p")
    papers = []
    for paper in paper_elements:
        links = paper.find_all("a")
        # Keep only the title link by skipping the "pdf", "bib", and "abs" links
        links = [
            link for link in links if link.text.strip() not in ["pdf", "bib", "abs"]
        ]
        if not links:
            continue
        title = links[0].text.strip()
        url = "https://aclanthology.org" + links[0]["href"]
        papers.append({"title": title, "url": url})

    return {"author": author_name, "papers": papers}


def extract_venue_info(venue_url):
    """
    Extracts venue information from the given URL.

    Args:
        venue_url (str): The URL of the venue to extract information from.

    Returns:
        dict: A dictionary containing the venue name and a list of papers with their titles and URLs.
    """
    html_doc = requests.get(venue_url, timeout=10).text
    soup = BeautifulSoup(html_doc, "html.parser")

    venue_name = soup.find("h2", id="title").text.strip()
    paper_elements = soup.find_all("p")
    papers = []
    for paper in paper_elements:
        links = paper.find_all("a")
        # Keep only the title link by skipping the "pdf", "bib", and "abs" links
        links = [
            link for link in links if link.text.strip() not in ["pdf", "bib", "abs"]
        ]
        if not links:
            continue
        title = links[0].text.strip()
        url = "https://aclanthology.org" + links[0]["href"]
        papers.append({"title": title, "url": url})

    return {"venue": venue_name, "papers": papers}

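# Illustrative usage sketch for the two listing scrapers above, which share the same
# link-filtering logic and return {"author"/"venue": <name>, "papers": [{"title", "url"}, ...]}.
# The URLs are the author and venue examples from the __main__ block below:
#
#   author_info = extract_author_info("https://aclanthology.org/people/a/anna-rogers/")
#   venue_info = extract_venue_info("https://aclanthology.org/events/acl-2022/")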

def determine_page_type(url):
    """
    Determine the type of ACL Anthology page given its URL.

    Args:
        url (str): The URL to be checked.

    Returns:
        str: "paper", "author", or "venue". Returns None if the type can't be determined.
    """
    # Split the URL into non-empty path segments
    segments = [segment for segment in url.split("/") if segment]

    # Check if the URL points to an event (venue)
    if "events" in url or "volumes" in url:
        return "venue"

    # If the last segment looks like "2023.acl-long.1" (year.volume-id.number), it's a paper
    parts = segments[-1].split(".") if segments else []
    if len(parts) == 3 and parts[0].isnumeric() and parts[2].isnumeric():
        return "paper"

    if "people" in url:
        return "author"

        # If none of the above rules apply, fetch the page and check its content
    try:
        html_doc = requests.get(url, timeout=10).text
        soup = BeautifulSoup(html_doc, "html.parser")

        # Check for unique elements specific to each page type
        if soup.find("h2", id="title"):
            return (
                "author"
                if soup.find("a", href=True, text="Google Scholar")
                else "paper"
            )
        elif soup.find("h1", text="Anthology Volume"):
            return "venue"
    except Exception as e:
        print(f"Error determining page type: {e}")

    return None

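# Illustrative expectations for the URL heuristics above; with the pattern checks in place,
# none of these examples should need a network request:
#
#   determine_page_type("https://aclanthology.org/events/acl-2022/")       # -> "venue"
#   determine_page_type("https://aclanthology.org/people/a/anna-rogers/")  # -> "author"
#   determine_page_type("https://aclanthology.org/2023.acl-long.1/")       # -> "paper"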

if __name__ == "__main__":
    # asyncio.get_event_loop() is deprecated outside a running loop; create one explicitly
    loop = asyncio.new_event_loop()

    urls = [
        "https://aclanthology.org/2023.acl-long.1/",
        "https://aclanthology.org/people/a/anna-rogers/",
        "https://aclanthology.org/events/acl-2022/",
    ]

    for url in urls:
        # Determine the page type once per URL (it may require a network request)
        page_type = determine_page_type(url)
        if page_type == "paper":
            print(f"Paper: {url}")
            res = extract_paper_info(url)
            paper = loop.run_until_complete(
                async_match_acl_id_to_s2_paper(res["acl_id"])
            )
            print(paper)

        elif determine_page_type(url) == "author":
            print(f"Author: {url}")
            res = extract_author_info(url)
            tasks = [
                async_match_acl_id_to_s2_paper(paper["url"].split("/")[-2])
                for paper in res["papers"]
            ]
            s2_ids = loop.run_until_complete(asyncio.gather(*tasks))
            for paper, s2_id in zip(res["papers"], s2_ids):
                # The Semantic Scholar record (not the scraped dict) carries "paperId"
                print(s2_id.get("paperId"))

        elif determine_page_type(url) == "venue":
            print(f"Venue: {url}")
            res = extract_venue_info(url)
            tasks = [
                async_match_acl_id_to_s2_paper(paper["url"].split("/")[-2])
                for paper in res["papers"]
            ]
            s2_ids = loop.run_until_complete(asyncio.gather(*tasks))
            for paper, s2_id in zip(res["papers"], s2_ids):
                # The Semantic Scholar record (not the scraped dict) carries "paperId"
                print(s2_id.get("paperId"))