Spaces:

jpwahle
/

field-diversity

Running

App Files Files Community

field-diversity / aclanthology.py

jpwahle

Initial commit

505fd08 7 months ago

raw history blame contribute delete

No virus

6.42 kB

	# Copyright 2023 by Jan Philip Wahle, https://jpwahle.com/
	# All rights reserved.

	import asyncio
	import json
	import re
	from urllib.parse import urljoin

	import aiohttp
	import requests
	from bs4 import BeautifulSoup


	async def fetch(session, url):
	    """
	    Request ``url`` through ``session`` and return the response body as text.

	    Args:
	        session: Object whose ``get(url)`` yields an async context manager
	            producing the response (callers in this file pass an
	            ``aiohttp.ClientSession``).
	        url (str): The address to request.

	    Returns:
	        The awaited result of ``response.text()``.
	    """
	    async with session.get(url) as reply:
	        body = await reply.text()
	    return body


	async def async_match_acl_id_to_s2_paper(acl_id):
	    """
	    Look up a paper on the Semantic Scholar Graph API by its ACL ID.

	    A fresh ``aiohttp.ClientSession`` is opened for the single request and
	    closed again before the function returns.

	    Args:
	        acl_id (str): The ACL ID of the paper to look up.

	    Returns:
	        The JSON-decoded body of the API response.
	    """
	    endpoint = f"https://api.semanticscholar.org/graph/v1/paper/ACL:{acl_id}"
	    async with aiohttp.ClientSession() as client:
	        payload = await fetch(client, endpoint)
	        decoded = json.loads(payload)
	    return decoded


	def extract_paper_info(paper_url):
	    """
	    Extracts information about a paper from its ACL Anthology URL.

	    The page is downloaded with a 10 second timeout and parsed with
	    BeautifulSoup's ``html.parser``.

	    Args:
	        paper_url (str): The URL of the paper on the ACL Anthology website,
	            with or without a trailing slash.

	    Returns:
	        dict: A dictionary with keys "title" (text of ``<h2 id="title">``),
	        "authors" (link texts found directly inside ``<p class="lead">``) and
	        "acl_id" (last non-empty path segment of ``paper_url``).

	    Raises:
	        AttributeError: If the page has no ``<h2 id="title">`` element.
	    """
	    html_doc = requests.get(paper_url, timeout=10).text
	    soup = BeautifulSoup(html_doc, "html.parser")

	    title = soup.find("h2", id="title").text.strip()

	    # Keep only anchors whose direct parent is <p class="lead">. ``.get`` is
	    # used because indexing a <p> that has no class attribute raises KeyError.
	    authors = [
	        a.text
	        for a in soup.find_all("a")
	        if a.parent.name == "p" and a.parent.get("class") == ["lead"]
	    ]

	    # Take the last non-empty path segment so that URLs without a trailing
	    # slash do not yield the host name instead of the paper ID.
	    acl_id = paper_url.rstrip("/").rsplit("/", 1)[-1]

	    return {"title": title, "authors": authors, "acl_id": acl_id}


	def _extract_listed_papers(soup):
	    """
	    Collects the papers listed on a parsed author or venue page.

	    Every ``<p>`` element is inspected; its first link that has an ``href``
	    and whose text is not one of the "pdf" / "bib" / "abs" buttons is taken
	    as the paper link.

	    Args:
	        soup (BeautifulSoup): The parsed listing page.

	    Returns:
	        list: One dictionary per paper with keys "title" and "url".
	    """
	    button_labels = ("pdf", "bib", "abs")
	    papers = []
	    for paragraph in soup.find_all("p"):
	        # href=True skips anchors without a target, which would otherwise
	        # raise KeyError when the href is read below.
	        links = [
	            link
	            for link in paragraph.find_all("a", href=True)
	            if link.text.strip() not in button_labels
	        ]
	        if not links:
	            continue
	        first = links[0]
	        papers.append(
	            {
	                "title": first.text.strip(),
	                # urljoin resolves site-relative hrefs against the base and
	                # leaves already-absolute hrefs intact, instead of gluing
	                # the base in front of them.
	                "url": urljoin("https://aclanthology.org", first["href"]),
	            }
	        )
	    return papers


	def extract_author_info(author_url):
	    """
	    Extracts author information from the given author URL.

	    Args:
	        author_url (str): The URL of the author's page on ACL Anthology.

	    Returns:
	        dict: A dictionary containing the author's name ("author") and a list
	        of their papers ("papers"). Each paper is represented as a dictionary
	        with keys "title" and "url".

	    Raises:
	        AttributeError: If the page has no ``<h2 id="title">`` element.
	    """
	    html_doc = requests.get(author_url, timeout=10).text
	    soup = BeautifulSoup(html_doc, "html.parser")

	    author_name = soup.find("h2", id="title").text.strip()
	    return {"author": author_name, "papers": _extract_listed_papers(soup)}


	def extract_venue_info(venue_url):
	    """
	    Extracts venue information from the given URL.

	    Args:
	        venue_url (str): The URL of the venue to extract information from.

	    Returns:
	        dict: A dictionary containing the venue name ("venue") and a list of
	        papers ("papers"), each a dictionary with keys "title" and "url".

	    Raises:
	        AttributeError: If the page has no ``<h2 id="title">`` element.
	    """
	    html_doc = requests.get(venue_url, timeout=10).text
	    soup = BeautifulSoup(html_doc, "html.parser")

	    venue_name = soup.find("h2", id="title").text.strip()
	    return {"venue": venue_name, "papers": _extract_listed_papers(soup)}


	# Shapes accepted as a paper ID in the last URL segment: the dotted form
	# such as "2023.acl-long.1" and the letter-prefixed form such as "P19-1001".
	_ACL_PAPER_ID_RE = re.compile(r"(?:\d{4}\.[\w-]+\.\d+|[A-Z]\d{2}-\d{4,})")


	def determine_page_type(url):
	    """
	    Determine the type of ACL Anthology page given its URL.

	    The URL text is inspected first; the page is only downloaded when none of
	    the URL-based rules match.

	    Args:
	        url (str): The URL to be checked.

	    Returns:
	        str: "paper", "author", or "venue". Returns None if the type can't be
	        determined or if fetching/parsing the page fails.
	    """
	    # Non-empty pieces of the URL; the last one is the candidate paper ID.
	    segments = [segment for segment in url.split("/") if segment]

	    # Check if the URL points to an event (venue)
	    if "events" in url or "volumes" in url:
	        return "venue"

	    # If the URL ends in a pattern like "2023.acl-long.1" it's a paper. The
	    # ID is the *last* segment; the segment before it is the host name.
	    if segments and _ACL_PAPER_ID_RE.fullmatch(segments[-1]):
	        return "paper"

	    if "people" in url:
	        return "author"

	    # If none of the above rules apply, fetch the page and check its content
	    try:
	        html_doc = requests.get(url, timeout=10).text
	        soup = BeautifulSoup(html_doc, "html.parser")

	        # Check for unique elements specific to each page type
	        if soup.find("h2", id="title"):
	            if soup.find("a", href=True, text="Google Scholar"):
	                return "author"
	            return "paper"
	        if soup.find("h1", text="Anthology Volume"):
	            return "venue"
	    except Exception as e:
	        # Best effort: any failure is reported and treated as "unknown".
	        print(f"Error determining page type: {e}")

	    return None


	async def _match_listed_papers(papers):
	    """
	    Look up every listed paper on Semantic Scholar concurrently.

	    Args:
	        papers (list): Dictionaries with a "url" key whose last non-empty
	            path segment is the paper's ACL ID.

	    Returns:
	        list: The lookup results, in the same order as ``papers``.
	    """
	    lookups = [
	        async_match_acl_id_to_s2_paper(
	            paper["url"].rstrip("/").rsplit("/", 1)[-1]
	        )
	        for paper in papers
	    ]
	    return await asyncio.gather(*lookups)


	if __name__ == "__main__":
	    urls = [
	        "https://aclanthology.org/2023.acl-long.1/",
	        "https://aclanthology.org/people/a/anna-rogers/",
	        "https://aclanthology.org/events/acl-2022/",
	    ]

	    for url in urls:
	        # Classify once per URL: determine_page_type may download the page.
	        page_type = determine_page_type(url)

	        if page_type == "paper":
	            print(f"Paper: {url}")
	            res = extract_paper_info(url)
	            paper = asyncio.run(async_match_acl_id_to_s2_paper(res["acl_id"]))
	            print(paper)

	        elif page_type in ("author", "venue"):
	            if page_type == "author":
	                print(f"Author: {url}")
	                res = extract_author_info(url)
	            else:
	                print(f"Venue: {url}")
	                res = extract_venue_info(url)

	            s2_papers = asyncio.run(_match_listed_papers(res["papers"]))
	            for s2_paper in s2_papers:
	                # "paperId" belongs to the Semantic Scholar result, not to
	                # the scraped {"title", "url"} entry; .get() tolerates
	                # responses that carry no such key.
	                print(s2_paper.get("paperId"))