Spaces:
Sleeping
Sleeping
# Copyright 2023 by Jan Philip Wahle, https://jpwahle.com/
# All rights reserved.
import asyncio | |
import os | |
from collections import Counter | |
from concurrent.futures import ThreadPoolExecutor, as_completed | |
import requests | |
from aclanthology import ( | |
async_match_acl_id_to_s2_paper, | |
extract_author_info, | |
extract_paper_info, | |
extract_venue_info, | |
) | |
from metrics import calculate_gini, calculate_gini_simpson | |
def get_or_create_eventloop():
    """Return the calling thread's asyncio event loop, creating one if needed.

    ``asyncio.get_event_loop()`` raises ``RuntimeError`` when the calling
    thread has no event loop set. In that case a new loop is created,
    installed as the thread's current loop, and returned.

    Returns:
        asyncio.AbstractEventLoop: The event loop for the calling thread.

    Raises:
        RuntimeError: If ``asyncio.get_event_loop()`` fails for a reason other
            than a missing event loop.
    """
    try:
        return asyncio.get_event_loop()
    except RuntimeError as ex:
        if "There is no current event loop in thread" not in str(ex):
            # Unrelated failure: surface it instead of silently returning None.
            raise
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        return loop
def send_s2_request(request_url):
    """
    Issue a GET request that carries the S2 API key.

    The key is read from the ``s2apikey`` environment variable and sent in the
    ``x-api-key`` header. The request uses a 10 second timeout.

    Args:
        request_url (str): The URL to request.

    Returns:
        requests.Response: The response returned by ``requests.get``.

    Raises:
        KeyError: If the ``s2apikey`` environment variable is not set.
    """
    auth_headers = {"x-api-key": os.environ["s2apikey"]}
    return requests.get(request_url, headers=auth_headers, timeout=10)
def check_s2_id_type(semantic_scholar_id):
    """
    Classify a Semantic Scholar ID by probing the paper and author endpoints.

    The paper endpoint is queried first; the author endpoint is only queried
    when the paper lookup does not answer with HTTP 200. Both requests are
    sent without an API key and with a 5 second timeout.

    Args:
        semantic_scholar_id (str): The Semantic Scholar ID to check.

    Returns:
        tuple: ``("paper", None)`` when the paper lookup answers with HTTP 200;
        ``("author", name)`` when the author lookup answers with HTTP 200,
        where ``name`` is the ``"name"`` entry of the response JSON;
        ``("author", "invalid")`` when neither lookup succeeds.
    """
    api_root = "https://api.semanticscholar.org/v1/"

    paper_lookup = requests.get(
        f"{api_root}paper/{semantic_scholar_id}", timeout=5
    )
    if paper_lookup.status_code == 200:
        return "paper", None

    author_lookup = requests.get(
        f"{api_root}author/{semantic_scholar_id}", timeout=5
    )
    if author_lookup.status_code != 200:
        # The first element stays "author" even for an unknown ID; only the
        # second element signals that the lookup failed.
        return "author", "invalid"
    return "author", author_lookup.json()["name"]
def get_papers_from_author(ssid_author_id):
    """Fetch the paper IDs listed for a Semantic Scholar author.

    Args:
        ssid_author_id (str): Semantic Scholar author ID.

    Returns:
        list: The ``paperId`` of every entry under ``"papers"`` in the author
        response, or an empty list when the request does not return HTTP 200
        or the response has no ``"papers"`` entry.
    """
    author_url = f"https://api.semanticscholar.org/graph/v1/author/{ssid_author_id}?fields=papers"
    response = send_s2_request(author_url)
    if response.status_code != 200:
        return []
    return [entry["paperId"] for entry in response.json().get("papers", [])]
def compute_stats_for_s2_paper(ssid_paper_id):
    """
    Compute reference statistics for a paper using the Semantic Scholar API.

    The paper is fetched together with its reference list, then every
    reference with a non-null ``paperId`` is fetched concurrently to obtain
    its title, year and fields of study.

    Args:
        ssid_paper_id (str): The Semantic Scholar ID of the paper.

    Returns:
        ``None`` when the request for the paper itself does not return
        HTTP 200. A tuple of seven ``None`` values when the paper has no
        references or no citation age can be computed. Otherwise a 7-tuple:
            - title_authors (str): Paper title, a newline, then the
              comma-separated author names.
            - num_references (int): Number of fetched references that have a
              publication year.
            - fields_of_study_counts (dict): Count per field-of-study category
              among the references (only ``"s2-fos-model"`` entries, with
              categories containing "Computer Science" left out).
            - year_to_title_dict (dict): Reference year mapped to a reference
              title; when several references share a year, one of them wins.
            - cfdi (float): Result of ``calculate_gini_simpson`` on
              ``fields_of_study_counts``.
            - cadi (float): Result of ``calculate_gini`` on the citation ages.
            - output_maoc (float): Mean citation age, i.e. the mean of
              (paper year - reference year).
    """
    # Every "no data" path returns the same 7-slot shape as the success path.
    no_stats = (None,) * 7

    # Get the paper and its references
    request_url = f"https://api.semanticscholar.org/graph/v1/paper/{ssid_paper_id}?fields=references,title,year,authors"
    r = send_s2_request(request_url)
    if r.status_code != 200:
        return None

    result = r.json()
    references = result.get("references")
    if not references:
        return no_stats

    # References that Semantic Scholar could not resolve have a null paperId.
    ref_paper_keys = [
        reference["paperId"]
        for reference in references
        if reference["paperId"] is not None
    ]

    title, year, authors = result["title"], result["year"], result["authors"]
    title_authors = (
        title + "\n" + ", ".join([author["name"] for author in authors])
    )

    # Fetch all references concurrently. Year and title are stored as a pair
    # so that dropping a missing value later cannot misalign them.
    reference_records = []
    reference_fos_list = []
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(
                send_s2_request,
                f"https://api.semanticscholar.org/graph/v1/paper/{ref_paper_key}?fields=title,year,s2FieldsOfStudy",
            )
            for ref_paper_key in ref_paper_keys
        ]
        for future in as_completed(futures):
            r_ref = future.result()
            if r_ref.status_code != 200:
                print(
                    f"Error retrieving reference {r_ref.status_code} for"
                    f" paper {ssid_paper_id}"
                )
                continue
            result_ref = r_ref.json()
            reference_records.append(
                (result_ref["year"], result_ref["title"])
            )
            reference_fos_list.extend(
                field["category"]
                # The field list may be null for some papers.
                for field in result_ref["s2FieldsOfStudy"] or []
                if field["source"] == "s2-fos-model"
            )

    # Only references with a known year are counted.
    reference_year_list = [
        year_ref for year_ref, _ in reference_records if year_ref is not None
    ]
    num_references = len(reference_year_list)

    # Count field occurrences, leaving out Computer Science categories
    fields_of_study_counts = dict(
        Counter(
            field
            for field in reference_fos_list
            if "Computer Science" not in field
        )
    )

    # Citation ages; empty when the paper itself has no year
    aoc_list = [
        year - year_ref
        for year_ref in reference_year_list
        if year_ref and year
    ]
    if not aoc_list:
        return no_stats

    output_maoc = sum(aoc_list) / len(aoc_list)
    cadi = calculate_gini(aoc_list)

    # Year -> title, built from pairs where both values are present
    year_to_title_dict = {
        year_ref: title_ref
        for year_ref, title_ref in reference_records
        if year_ref is not None and title_ref is not None
    }

    cfdi = calculate_gini_simpson(fields_of_study_counts)

    return (
        title_authors,
        num_references,
        fields_of_study_counts,
        year_to_title_dict,
        cfdi,
        cadi,
        output_maoc,
    )
def compute_stats_for_s2_author(ssid_author_id, author_name):
    """
    Compute aggregated statistics over all papers of a Semantic Scholar author.

    Args:
        ssid_author_id (str): The Semantic Scholar author ID.
        author_name (str): The author's name, passed on as the title of the
            aggregated result.

    Returns:
        The value returned by ``compute_stats_for_multiple_s2_papers`` for the
        author's paper IDs, or ``None`` when no papers were found.
    """
    paper_ids = get_papers_from_author(ssid_author_id)
    if not paper_ids:
        return None
    return compute_stats_for_multiple_s2_papers(paper_ids, author_name)
def compute_stats_for_acl_paper(url):
    """
    Compute statistics for a paper given its ACL Anthology URL.

    The ACL ID extracted from the page is matched to a Semantic Scholar
    paper, and the statistics are then computed for that paper.

    Args:
        url (str): The URL of the paper on the ACL Anthology website.

    Returns:
        The value returned by ``compute_stats_for_s2_paper`` for the matched
        paper, or ``None`` when no paper info could be extracted from the URL
        or the paper could not be matched to a Semantic Scholar ID.
    """
    paper_info = extract_paper_info(url)
    if not paper_info:
        return None

    loop = get_or_create_eventloop()
    # Match paper ID to Semantic Scholar ID
    s2_paper = loop.run_until_complete(
        async_match_acl_id_to_s2_paper(paper_info["acl_id"])
    )
    # A match result may lack "paperId" (the author and venue variants in this
    # file filter on it), so treat that as "not found" instead of raising.
    if not s2_paper or "paperId" not in s2_paper:
        return None
    return compute_stats_for_s2_paper(s2_paper["paperId"])
import asyncio | |
def compute_stats_for_acl_author(url):
    """
    Compute aggregated statistics for an author's papers in the ACL Anthology.

    Every paper listed on the author page is matched to a Semantic Scholar
    paper concurrently; matches without a ``"paperId"`` are dropped.

    Args:
        url (str): The URL of the author's page on the ACL Anthology website.

    Returns:
        The value returned by ``compute_stats_for_multiple_s2_papers`` for the
        matched paper IDs, titled with the author's name, or ``None`` when no
        author info could be extracted from the URL.
    """
    author_info = extract_author_info(url)
    if not author_info:
        return None

    loop = get_or_create_eventloop()
    lookups = []
    for entry in author_info["papers"]:
        # The ACL ID is the second-to-last segment of the paper URL.
        acl_id = entry["url"].split("/")[-2]
        lookups.append(async_match_acl_id_to_s2_paper(acl_id))

    matches = loop.run_until_complete(asyncio.gather(*lookups))
    matched_ids = [match["paperId"] for match in matches if "paperId" in match]
    return compute_stats_for_multiple_s2_papers(
        matched_ids, author_info["author"]
    )
def compute_stats_for_acl_venue(url):
    """
    Compute aggregated statistics for the papers of an ACL Anthology venue.

    Every paper listed for the venue is matched to a Semantic Scholar paper
    concurrently; matches without a ``"paperId"`` are dropped.

    Args:
        url (str): The URL of the venue page on the ACL Anthology website.

    Returns:
        The value returned by ``compute_stats_for_multiple_s2_papers`` for the
        matched paper IDs, titled with the venue name, or ``None`` when no
        venue info could be extracted from the URL.
    """
    venue_info = extract_venue_info(url)
    if not venue_info:
        return None

    loop = get_or_create_eventloop()
    # The ACL ID is the second-to-last segment of each paper URL.
    acl_ids = (entry["url"].split("/")[-2] for entry in venue_info["papers"])
    pending = [async_match_acl_id_to_s2_paper(acl_id) for acl_id in acl_ids]
    s2_matches = loop.run_until_complete(asyncio.gather(*pending))

    s2_paper_ids = [
        match["paperId"] for match in s2_matches if "paperId" in match
    ]
    return compute_stats_for_multiple_s2_papers(
        s2_paper_ids, venue_info["venue"]
    )
def compute_stats_for_multiple_s2_papers(papers, title):
    """
    Aggregate per-paper reference statistics over several papers.

    Statistics are computed for each paper concurrently with
    ``compute_stats_for_s2_paper``. Papers that yield no statistics are
    skipped. Reference and field counts are summed, the year-to-title
    mappings are merged (later papers overwrite earlier ones for the same
    year), and the three index values are averaged over the papers that
    produced statistics.

    Args:
        papers (list): Semantic Scholar paper IDs.
        title (str): Label for the aggregate (e.g. an author or venue name);
            returned unchanged as the first element.

    Returns:
        tuple: ``(title, num_references, top_fields, oldest_paper_dict,
        mean_cfdi, mean_cadi, mean_maoc)``. When ``papers`` is empty or no
        paper produced statistics, the counts are zero, the dicts are empty
        and the means are ``0.0``.
    """
    if not papers:
        # Nothing to fetch; also avoids dividing by zero below.
        return title, 0, {}, {}, 0.0, 0.0, 0.0

    with ThreadPoolExecutor() as executor:
        results_list = list(executor.map(compute_stats_for_s2_paper, papers))

    num_references = 0
    top_fields = {}
    oldest_paper_dict = {}
    cfdi_sum = 0.0
    cadi_sum = 0.0
    maoc_sum = 0.0
    num_with_stats = 0

    for results in results_list:
        if not results or results[0] is None:
            continue
        num_with_stats += 1
        num_references += results[1]
        for field, count in results[2].items():
            top_fields[field] = top_fields.get(field, 0) + count
        # Merged with update() so no loop variable can shadow the ``title``
        # parameter that is returned below.
        oldest_paper_dict.update(results[3])
        cfdi_sum += results[4]
        cadi_sum += results[5]
        maoc_sum += results[6]

    # Average over the papers that actually contributed to the sums.
    denominator = num_with_stats or 1
    return (
        title,
        num_references,
        top_fields,
        oldest_paper_dict,
        cfdi_sum / denominator,
        cadi_sum / denominator,
        maoc_sum / denominator,
    )