# Copyright 2023 by Jan Philip Wahle, https://jpwahle.com/
# All rights reserved.

import asyncio
import json

import aiohttp
import requests
from bs4 import BeautifulSoup


async def fetch(session, url):
    """Asynchronous function to fetch a URL using aiohttp."""
    async with session.get(url) as response:
        return await response.text()


async def async_match_acl_id_to_s2_paper(acl_id):
    """
    Fetches the paper information from the Semantic Scholar API for the given ACL ID.

    Args:
        acl_id (str): The ACL ID of the paper to fetch.

    Returns:
        dict: A dictionary containing the paper information.
    """
    url = f"https://api.semanticscholar.org/graph/v1/paper/ACL:{acl_id}"
    async with aiohttp.ClientSession() as session:
        res_text = await fetch(session, url)
    return json.loads(res_text)

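# Illustrative usage sketch (not part of the original flow): fetch a single Semantic Scholar
# record for the ACL ID used in the __main__ example below. The Graph API response is a dict
# with fields such as "paperId" and "title" (exact fields depend on the API defaults).
#
#   record = asyncio.run(async_match_acl_id_to_s2_paper("2023.acl-long.1"))
#   print(record.get("paperId"), record.get("title"))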

def extract_paper_info(paper_url):
    """
    Extracts information about a paper from its ACL Anthology URL.

    Args:
        paper_url (str): The URL of the paper on the ACL Anthology website.

    Returns:
        dict: A dictionary containing the title, authors, and ACL Anthology ID of the paper.
    """
    html_doc = requests.get(paper_url, timeout=10).text
    soup = BeautifulSoup(html_doc, "html.parser")

    title = soup.find("h2", id="title").text.strip()
    authors = [
        a.text
        for a in soup.find_all("a")
        # use .get() so <p> parents without a class attribute don't raise KeyError
        if a.parent.name == "p" and a.parent.get("class") == ["lead"]
    ]
    acl_id = paper_url.split("/")[-2]

    return {"title": title, "authors": authors, "acl_id": acl_id}

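# Illustrative usage sketch, reusing a paper URL from the __main__ block below:
#
#   info = extract_paper_info("https://aclanthology.org/2023.acl-long.1/")
#   # info -> {"title": "...", "authors": ["...", ...], "acl_id": "2023.acl-long.1"}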

def extract_author_info(author_url):
    """
    Extracts author information from the given author URL.

    Args:
        author_url (str): The URL of the author's page on ACL Anthology.

    Returns:
        dict: A dictionary containing the author's name and a list of their papers.
              Each paper is represented as a dictionary with keys "title" and "url".
    """
    html_doc = requests.get(author_url, timeout=10).text
    soup = BeautifulSoup(html_doc, "html.parser")

    author_name = soup.find("h2", id="title").text.strip()
    paper_elements = soup.find_all("p")
    papers = []
    for paper in paper_elements:
        links = paper.find_all("a")
        # Keep only the title link by skipping the "pdf", "bib", and "abs" links
        links = [
            link for link in links if link.text.strip() not in ["pdf", "bib", "abs"]
        ]
        if not links:
            continue
        title = links[0].text.strip()
        url = "https://aclanthology.org" + links[0]["href"]
        papers.append({"title": title, "url": url})

    return {"author": author_name, "papers": papers}


def extract_venue_info(venue_url):
    """
    Extracts venue information from the given URL.

    Args:
        venue_url (str): The URL of the venue to extract information from.

    Returns:
        dict: A dictionary containing the venue name and a list of papers with their titles and URLs.
    """
    html_doc = requests.get(venue_url, timeout=10).text
    soup = BeautifulSoup(html_doc, "html.parser")

    venue_name = soup.find("h2", id="title").text.strip()
    paper_elements = soup.find_all("p")
    papers = []
    for paper in paper_elements:
        links = paper.find_all("a")
        # Keep only the title link by skipping the "pdf", "bib", and "abs" links
        links = [
            link for link in links if link.text.strip() not in ["pdf", "bib", "abs"]
        ]
        if not links:
            continue
        title = links[0].text.strip()
        url = "https://aclanthology.org" + links[0]["href"]
        papers.append({"title": title, "url": url})

    return {"venue": venue_name, "papers": papers}

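# Illustrative usage sketch for the two listing scrapers above, which share the same
# link-filtering logic and return {"author"/"venue": <name>, "papers": [{"title", "url"}, ...]}.
# The URLs are the author and venue examples from the __main__ block below:
#
#   author_info = extract_author_info("https://aclanthology.org/people/a/anna-rogers/")
#   venue_info = extract_venue_info("https://aclanthology.org/events/acl-2022/")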

def determine_page_type(url):
    """
    Determine the type of ACL Anthology page given its URL.

    Args:
        url (str): The URL to be checked.

    Returns:
        str: "paper", "author", or "venue". Returns None if the type can't be determined.
    """
    # Split the URL into non-empty path segments
    segments = [segment for segment in url.split("/") if segment]

    # Check if the URL points to an event (venue)
    if "events" in url or "volumes" in url:
        return "venue"

    # If the last segment looks like "2023.acl-long.1" (year.volume-id.number), it's a paper
    parts = segments[-1].split(".") if segments else []
    if len(parts) == 3 and parts[0].isnumeric() and parts[2].isnumeric():
        return "paper"

    if "people" in url:
        return "author"

        # If none of the above rules apply, fetch the page and check its content
    try:
        html_doc = requests.get(url, timeout=10).text
        soup = BeautifulSoup(html_doc, "html.parser")

        # Check for unique elements specific to each page type
        if soup.find("h2", id="title"):
            return (
                "author"
                if soup.find("a", href=True, text="Google Scholar")
                else "paper"
            )
        elif soup.find("h1", text="Anthology Volume"):
            return "venue"
    except Exception as e:
        print(f"Error determining page type: {e}")

    return None

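# Illustrative expectations for the URL heuristics above; with the pattern checks in place,
# none of these examples should need a network request:
#
#   determine_page_type("https://aclanthology.org/events/acl-2022/")       # -> "venue"
#   determine_page_type("https://aclanthology.org/people/a/anna-rogers/")  # -> "author"
#   determine_page_type("https://aclanthology.org/2023.acl-long.1/")       # -> "paper"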

if __name__ == "__main__":
    # asyncio.get_event_loop() is deprecated outside a running loop; create one explicitly
    loop = asyncio.new_event_loop()

    urls = [
        "https://aclanthology.org/2023.acl-long.1/",
        "https://aclanthology.org/people/a/anna-rogers/",
        "https://aclanthology.org/events/acl-2022/",
    ]

    for url in urls:
        # Determine the page type once per URL (it may require a network request)
        page_type = determine_page_type(url)
        if page_type == "paper":
            print(f"Paper: {url}")
            res = extract_paper_info(url)
            paper = loop.run_until_complete(
                async_match_acl_id_to_s2_paper(res["acl_id"])
            )
            print(paper)

        elif determine_page_type(url) == "author":
            print(f"Author: {url}")
            res = extract_author_info(url)
            tasks = [
                async_match_acl_id_to_s2_paper(paper["url"].split("/")[-2])
                for paper in res["papers"]
            ]
            s2_ids = loop.run_until_complete(asyncio.gather(*tasks))
            for paper, s2_id in zip(res["papers"], s2_ids):
                # The Semantic Scholar record (not the scraped dict) carries "paperId"
                print(s2_id.get("paperId"))

        elif determine_page_type(url) == "venue":
            print(f"Venue: {url}")
            res = extract_venue_info(url)
            tasks = [
                async_match_acl_id_to_s2_paper(paper["url"].split("/")[-2])
                for paper in res["papers"]
            ]
            s2_ids = loop.run_until_complete(asyncio.gather(*tasks))
            for paper, s2_id in zip(res["papers"], s2_ids):
                # The Semantic Scholar record (not the scraped dict) carries "paperId"
                print(s2_id.get("paperId"))