Spaces:
Sleeping
Sleeping
# Copyright 2023 by Jan Philip Wahle, https://jpwahle.com/ | |
# All rights reserved. | |
import asyncio | |
import datetime | |
import os | |
from collections import Counter | |
from concurrent.futures import ThreadPoolExecutor, as_completed | |
from typing import List, Tuple | |
import aiohttp | |
import requests | |
from aclanthology import ( | |
async_match_acl_id_to_s2_paper, | |
extract_author_info, | |
extract_paper_info, | |
extract_venue_info, | |
) | |
from metrics import calculate_gini, calculate_gini_simpson | |
from pdf import parse_pdf_to_artcile_dict | |
def get_or_create_eventloop():
    """
    Get the current event loop or create a new one if the thread has none.

    Returns:
        asyncio.AbstractEventLoop: The event loop bound to the calling thread.

    Raises:
        RuntimeError: Re-raised when ``asyncio.get_event_loop()`` fails for a
            reason other than a missing event loop in the current thread.
    """
    try:
        return asyncio.get_event_loop()
    except RuntimeError as ex:
        # Only the "no loop in this thread" failure is recoverable here. Any
        # other RuntimeError must propagate instead of silently becoming a
        # ``None`` return value that callers then try to use as a loop.
        if "There is no current event loop in thread" not in str(ex):
            raise
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        return loop
def send_s2_request(request_url):
    """
    Issue a GET request that carries the S2 API key in its headers.

    Args:
        request_url (str): The URL to fetch.
    Returns:
        requests.Response: The response object returned by the request.
    """
    # The key is read from the "s2apikey" environment variable on every call;
    # an unset variable raises KeyError.
    auth_headers = {"x-api-key": os.environ["s2apikey"]}
    return requests.get(request_url, headers=auth_headers, timeout=10)
def check_s2_id_type(semantic_scholar_id):
    """
    Probe the Semantic Scholar v1 API to classify an ID as paper or author.

    Args:
        semantic_scholar_id (str): The Semantic Scholar ID to check.
    Returns:
        tuple: One of
            - ``("paper", None)`` when the paper endpoint answers 200,
            - ``("author", <name>)`` when the author endpoint answers 200,
            - ``("author", "invalid")`` when neither endpoint answers 200.
    """
    # Try the paper endpoint first; a 200 short-circuits the author lookup.
    paper_lookup = requests.get(
        f"https://api.semanticscholar.org/v1/paper/{semantic_scholar_id}",
        timeout=5,
    )
    if paper_lookup.status_code == 200:
        return "paper", None

    # Otherwise fall back to the author endpoint.
    author_lookup = requests.get(
        f"https://api.semanticscholar.org/v1/author/{semantic_scholar_id}",
        timeout=5,
    )
    if author_lookup.status_code == 200:
        return "author", author_lookup.json()["name"]
    # Note: the first element stays "author" even for an unknown ID; only the
    # second element signals the failure.
    return "author", "invalid"
def get_papers_from_author(ssid_author_id):
    """Collect the paper IDs listed for a Semantic Scholar author.

    Args:
        ssid_author_id (str): semantic scholar id
    Returns:
        list: the ``paperId`` of every paper in the author's ``papers`` field,
            or an empty list when the request does not return status 200
    """
    request_url = f"https://api.semanticscholar.org/graph/v1/author/{ssid_author_id}?fields=papers"
    response = send_s2_request(request_url)
    if response.status_code != 200:
        return []

    paper_ids = []
    for entry in response.json().get("papers", []):
        paper_ids.append(entry["paperId"])
    return paper_ids
def compute_stats_for_references(s2_ref_paper_keys, year):
    """
    Computes various statistics for a list of reference paper keys.

    Each key is fetched concurrently from the Semantic Scholar graph API
    (fields ``title``, ``year`` and ``s2FieldsOfStudy``). Responses with a
    non-200 status are reported via ``print`` and skipped.

    Args:
        s2_ref_paper_keys (list): Semantic Scholar paper keys of the references.
        year (int): The year of the citing paper.
    Returns:
        tuple: A 6-tuple containing:
            - num_references (int): Number of references with a known year.
            - fields_of_study_counts (dict): Count per field of study, taken
              from "s2-fos-model" entries and excluding any field whose name
              contains "Computer Science".
            - year_to_title_dict (dict): Maps a reference year to the title of
              one reference from that year (later entries overwrite earlier
              ones for the same year).
            - cfdi (float): ``calculate_gini_simpson`` of the field counts.
            - cadi (float): ``calculate_gini`` of the citation ages.
            - output_maoc (float): Mean of the citation ages.
        If no citation age can be computed, returns a tuple of six None values.
    """
    # Keep (year, title) of each reference together so that dropping missing
    # values can never shift years relative to titles.
    reference_records = []
    reference_fos_list = []

    with ThreadPoolExecutor() as executor:
        # Remember which key every future belongs to so a failure can be
        # reported for the specific reference instead of the whole list.
        future_to_key = {
            executor.submit(
                send_s2_request,
                f"https://api.semanticscholar.org/graph/v1/paper/{ref_paper_key}?fields=title,year,s2FieldsOfStudy",
            ): ref_paper_key
            for ref_paper_key in s2_ref_paper_keys
        }
        for future in as_completed(future_to_key):
            r_ref = future.result()
            if r_ref.status_code != 200:
                print(
                    f"Error retrieving reference {r_ref.status_code} for"
                    f" paper {future_to_key[future]}"
                )
                continue
            result_ref = r_ref.json()
            reference_records.append(
                (result_ref.get("year"), result_ref.get("title"))
            )
            # ``s2FieldsOfStudy`` may be null in the response; treat as empty.
            reference_fos_list.extend(
                field["category"]
                for field in result_ref.get("s2FieldsOfStudy") or []
                if field["source"] == "s2-fos-model"
            )

    # Years of all references for which a year is known
    reference_year_list = [
        year_ref for year_ref, _ in reference_records if year_ref is not None
    ]

    # Count references
    num_references = len(reference_year_list)

    # Count occurrences of every field except Computer Science
    fields_of_study_counts = dict(
        Counter(
            field
            for field in reference_fos_list
            if "Computer Science" not in field
        )
    )

    # Citation age list
    aoc_list = [
        year - year_ref
        for year_ref in reference_year_list
        if year_ref and year
    ]
    if not aoc_list:
        return None, None, None, None, None, None

    # Compute citation age
    output_maoc = sum(aoc_list) / len(aoc_list)
    cadi = calculate_gini(aoc_list)

    # Year -> title, built only from references where both values are known
    year_to_title_dict = {
        year_ref: title_ref
        for year_ref, title_ref in reference_records
        if year_ref is not None and title_ref is not None
    }

    # Compute CFDI
    cfdi = calculate_gini_simpson(fields_of_study_counts)

    return (
        num_references,
        fields_of_study_counts,
        year_to_title_dict,
        cfdi,
        cadi,
        output_maoc,
    )
def compute_stats_for_s2_paper(ssid_paper_id):
    """
    Computes statistics for a given paper ID using the Semantic Scholar API.

    Args:
        ssid_paper_id (str): The Semantic Scholar ID of the paper.
    Returns:
        A 7-tuple containing:
            - title_authors (str): Title, a newline, then the comma-separated
              author names.
            - num_references (int)
            - fields_of_study_counts (dict)
            - year_to_title_dict (dict)
            - cfdi (float)
            - cadi (float)
            - output_maoc (float)
        The last six entries are exactly what ``compute_stats_for_references``
        returns for the paper's references (and may therefore all be None).
        If the paper lists no references, a tuple of seven None values is
        returned. If the request does not return status 200, None is returned.
    """
    # Get the paper and its references
    request_url = f"https://api.semanticscholar.org/graph/v1/paper/{ssid_paper_id}?fields=references,title,year,authors"
    r = send_s2_request(request_url)
    if r.status_code != 200:
        return None

    result = r.json()
    references = result.get("references")
    if not references:
        # Same arity as the success path so callers can unpack uniformly.
        return None, None, None, None, None, None, None

    # References that could not be resolved to an S2 paper have a null paperId
    filtered_s2_ref_paper_keys = [
        reference["paperId"]
        for reference in references
        if reference["paperId"] is not None
    ]

    title, year, authors = (
        result["title"],
        result["year"],
        result["authors"],
    )
    title_authors = (
        title + "\n" + ", ".join(author["name"] for author in authors)
    )

    (
        num_references,
        fields_of_study_counts,
        year_to_title_dict,
        cfdi,
        cadi,
        output_maoc,
    ) = compute_stats_for_references(filtered_s2_ref_paper_keys, year)

    return (
        title_authors,
        num_references,
        fields_of_study_counts,
        year_to_title_dict,
        cfdi,
        cadi,
        output_maoc,
    )
def compute_stats_for_s2_author(ssid_author_id, author_name):
    """
    Aggregate reference statistics over all papers of a Semantic Scholar author.

    Args:
        ssid_author_id (str): The Semantic Scholar author ID.
        author_name (str): The name of the author; passed through as the title
            of the aggregated result.
    Returns:
        The result of ``compute_stats_for_multiple_s2_papers`` for the author's
        papers, or None if no papers were found.
    """
    paper_ids = get_papers_from_author(ssid_author_id)
    if not paper_ids:
        return None
    return compute_stats_for_multiple_s2_papers(paper_ids, author_name)
def compute_stats_for_acl_paper(url):
    """
    Computes statistics for a paper based on its ACL Anthology URL.

    Args:
        url (str): The URL of the paper on the ACL Anthology website.
    Returns:
        The result of ``compute_stats_for_s2_paper`` for the matched paper, or
        None if no paper info could be extracted from the URL or the paper
        could not be matched to a Semantic Scholar paper ID.
    """
    paper_info = extract_paper_info(url)
    if not paper_info:
        return None

    loop = get_or_create_eventloop()
    # Match paper ID to Semantic Scholar ID
    s2_paper = loop.run_until_complete(
        async_match_acl_id_to_s2_paper(paper_info["acl_id"])
    )
    # The match may come back without a "paperId" (the author/venue variants
    # in this module filter on that key), so do not index it blindly.
    if not s2_paper or not s2_paper.get("paperId"):
        return None
    return compute_stats_for_s2_paper(s2_paper["paperId"])
def compute_stats_for_acl_author(url):
    """
    Aggregate reference statistics over an author's ACL Anthology papers.

    Args:
        url (str): The URL of the author's page on the ACL anthology website.
    Returns:
        The result of ``compute_stats_for_multiple_s2_papers`` for every paper
        that could be matched to a Semantic Scholar ID, titled with the author
        name. Returns None if no author info could be extracted from the URL.
    """
    author_info = extract_author_info(url)
    if not author_info:
        return None

    loop = get_or_create_eventloop()
    # The ACL ID is the second-to-last "/"-separated segment of the paper URL.
    lookups = []
    for paper in author_info["papers"]:
        acl_id = paper["url"].split("/")[-2]
        lookups.append(async_match_acl_id_to_s2_paper(acl_id))
    matches = loop.run_until_complete(asyncio.gather(*lookups))

    # Keep only matches that actually carry a Semantic Scholar paper ID.
    s2_paper_ids = [match["paperId"] for match in matches if "paperId" in match]
    return compute_stats_for_multiple_s2_papers(
        s2_paper_ids, author_info["author"]
    )
def compute_stats_for_acl_venue(url):
    """
    Aggregate reference statistics over the papers of an ACL venue.

    Args:
        url (str): The URL of the ACL venue.
    Returns:
        The result of ``compute_stats_for_multiple_s2_papers`` for every paper
        that could be matched to a Semantic Scholar ID, titled with the venue
        name. Returns None if no venue info could be extracted from the URL.
    """
    venue_info = extract_venue_info(url)
    if not venue_info:
        return None

    loop = get_or_create_eventloop()
    # The ACL ID is the second-to-last "/"-separated segment of the paper URL.
    lookups = []
    for paper in venue_info["papers"]:
        acl_id = paper["url"].split("/")[-2]
        lookups.append(async_match_acl_id_to_s2_paper(acl_id))
    matches = loop.run_until_complete(asyncio.gather(*lookups))

    # Keep only matches that actually carry a Semantic Scholar paper ID.
    s2_paper_ids = [match["paperId"] for match in matches if "paperId" in match]
    return compute_stats_for_multiple_s2_papers(
        s2_paper_ids, venue_info["venue"]
    )
def compute_stats_for_multiple_s2_papers(
    papers: List[str], title: str
) -> Tuple[str, int, dict, dict, float, float, float]:
    """
    Computes aggregated statistics for multiple S2 papers.

    Every paper is processed with ``compute_stats_for_s2_paper`` in a thread
    pool. Papers for which no statistics could be computed (failed request,
    no references, or no usable reference years) are skipped and do not count
    towards the averages.

    Args:
        papers (List[str]): Semantic Scholar paper IDs.
        title (str): Label for the aggregate (e.g. author or venue name); it
            is returned unchanged as the first tuple element.
    Returns:
        A tuple containing the following statistics:
            - title (str): The ``title`` argument.
            - num_references (int): Total number of references in all papers.
            - top_fields (dict): Field-of-study counts summed over all papers.
            - oldest_paper_dict (dict): Reference year mapped to a reference
              title; for a year seen in several papers the last one wins.
            - cfdi (float): Mean CFDI over the papers that produced statistics.
            - cadi (float): Mean CADI over the papers that produced statistics.
            - output_maoc (float): Mean MAOC over those papers.
        The three means are 0.0 when no paper produced statistics (including
        an empty ``papers`` list).
    """
    num_references = 0
    top_fields = {}
    oldest_paper_dict = {}
    cfdi_sum = 0.0
    cadi_sum = 0.0
    maoc_sum = 0.0
    scored_papers = 0

    results_list = []
    if papers:
        with ThreadPoolExecutor() as executor:
            results_list = list(
                executor.map(compute_stats_for_s2_paper, papers)
            )

    for results in results_list:
        # Skip failed lookups (None), papers without references (title is
        # None) and papers whose references yielded no statistics
        # (num_references is None).
        if not results or results[0] is None or results[1] is None:
            continue
        scored_papers += 1
        num_references += results[1]
        for field, count in results[2].items():
            top_fields[field] = top_fields.get(field, 0) + count
        oldest_paper_dict.update(results[3])
        cfdi_sum += results[4]
        cadi_sum += results[5]
        maoc_sum += results[6]

    # Average over the papers that actually contributed; guard against an
    # empty denominator instead of raising ZeroDivisionError.
    denominator = scored_papers or 1
    return (
        title,
        num_references,
        top_fields,
        oldest_paper_dict,
        cfdi_sum / denominator,
        cadi_sum / denominator,
        maoc_sum / denominator,
    )
async def send_s2_async_request(url):
    """
    Fetch a URL asynchronously and decode the response body as JSON.

    Args:
        url (str): The URL to send the request to.
    Returns:
        The decoded JSON payload of the response.
    """
    # A fresh client session is opened for, and closed after, every request.
    async with aiohttp.ClientSession() as session, session.get(url) as response:
        return await response.json()
async def match_title_to_s2_paper(title, authors=None):
    """
    Matches a given paper title to Semantic Scholar to retrieve its S2 paper ID.

    Args:
        title (str): The title of the paper.
        authors (List[str], optional): List of authors of the paper. Currently
            not used for matching. Defaults to None.
    Returns:
        str or None: The ``paperId`` of the top search hit, or None if the
        search returned no results.
    """
    # Imported locally so this function brings its own dependency into scope.
    from urllib.parse import quote

    # Percent-encode the title: raw "&", "#", "?" or "+" in a title would
    # otherwise be parsed as URL syntax and truncate or corrupt the query.
    search_url = (
        "https://api.semanticscholar.org/graph/v1/paper/search"
        f"?query={quote(title, safe='')}"
    )

    response = await send_s2_async_request(search_url)
    results = response.get("data") or []
    if not results:
        return None
    # Results are ranked by relevance; take the best match.
    return results[0].get("paperId")
async def compute_stats_for_pdf(pdf_file):
    """
    Compute reference statistics for an uploaded PDF file.

    Args:
        pdf_file (file): The PDF file; its ``name`` attribute is passed to the
            PDF parser.
    Returns:
        tuple: The article title followed by the six values returned by
        ``compute_stats_for_references``.
    """
    article_dict = parse_pdf_to_artcile_dict(pdf_file.name)

    # Look up every titled reference on Semantic Scholar concurrently.
    lookups = []
    for reference in article_dict["references"]:
        if reference["title"]:
            lookups.append(
                match_title_to_s2_paper(reference["title"], reference["authors"])
            )
    matched_ids = await asyncio.gather(*lookups)

    # Drop references that could not be matched.
    s2_paper_ids = [s2_id for s2_id in matched_ids if s2_id is not None]

    # The current year stands in for the article's publication year.
    year = int(datetime.date.today().strftime("%Y"))

    stats = compute_stats_for_references(s2_paper_ids, year)
    return (article_dict["title"],) + stats