# Hugging Face Space: jpwahle/field-time-diversity
# Space status: Sleeping
# File size: 12,542 Bytes

# Copyright 2023 by Jan Philip Wahle, https://jpwahle.com/
# All rights reserved.
# Thanks to Mukund Rungta for inspiration on early versions of this demo https://huggingface.co/spaces/mrungta8/CitationalAmnesia


import asyncio

import gradio as gr

from aclanthology import determine_page_type
from plots import generate_cfdi_plot, generate_maoc_plot
from s2 import (check_s2_id_type, compute_stats_for_acl_author,
                compute_stats_for_acl_paper, compute_stats_for_acl_venue,
                compute_stats_for_pdf, compute_stats_for_s2_author,
                compute_stats_for_s2_paper)


def return_clear():
    """Produce the values that blank every component of the demo.

    Returns:
        tuple: Eleven ``None`` values, one for each output component that
        the "Clear" button resets.
    """
    return (None,) * 11


def create_compute_stats(submit_type=None):
    """Build the callback that serves one of the demo's input tabs.

    Args:
        submit_type: Which input the returned callback acts on: ``"s2_id"``,
            ``"acl_link"`` or ``"pdf_file"``. With any other value the
            callback always returns the empty result.

    Returns:
        A function ``compute_stats(s2_id, pdf_file, acl_link)``. It returns
        whatever ``plot_and_return_stats`` returns for the selected input, or
        a tuple of eight ``None`` values when the selected input is empty or
        its type is not recognised.
    """
    nothing_to_show = (None,) * 8

    def _finish(stats, label):
        # Append what the statistics describe, then hand them to the plotter.
        return plot_and_return_stats(*(stats + (label,)))

    def compute_stats(s2_id=None, pdf_file=None, acl_link=None):
        if submit_type == "s2_id" and s2_id:
            # The same text box accepts both paper ids and author ids.
            id_kind, author_name = check_s2_id_type(s2_id)
            if id_kind == "paper":
                return _finish(compute_stats_for_s2_paper(s2_id), "paper")
            if id_kind == "author":
                return _finish(
                    compute_stats_for_s2_author(s2_id, author_name), "author"
                )
        elif submit_type == "acl_link" and acl_link:
            # An ACL Anthology link may point to a paper, an author or a venue.
            acl_handlers = {
                "paper": (compute_stats_for_acl_paper, "paper"),
                "author": (compute_stats_for_acl_author, "author"),
                "venue": (compute_stats_for_acl_venue, "proceedings"),
            }
            handler = acl_handlers.get(determine_page_type(acl_link))
            if handler is not None:
                compute, label = handler
                return _finish(compute(acl_link), label)
        elif submit_type == "pdf_file" and pdf_file:
            # The PDF pipeline is a coroutine; run it to completion here.
            return _finish(asyncio.run(compute_stats_for_pdf(pdf_file)), "paper")

        # Missing input or unrecognised id/page type.
        return nothing_to_show

    return compute_stats


def plot_and_return_stats(
    title_authors,
    num_references,
    field_counts,
    year_title_dict,
    cfdi,
    cadi,
    maoc,
    compute_type,
):
    """Turn raw citation statistics into the values shown in the demo.

    Args:
        title_authors: Heading for the result; returned unchanged.
        num_references: Number of references; returned unchanged.
        field_counts (dict): Maps a field name to its count.
        year_title_dict (dict): Maps a year to a title.
        cfdi: Numeric value; passed to ``generate_cfdi_plot`` and then
            rounded to three decimals.
        cadi: Numeric value; rounded to three decimals.
        maoc: Value passed to ``generate_maoc_plot``.
        compute_type: Label forwarded to both plot functions (callers in this
            file pass "paper", "author" or "proceedings").

    Returns:
        tuple: ``(title_authors, num_references, top_fields_text,
        oldest_paper_text, cfdi, cadi, plot_cfdi, plot_maoc)`` where
        ``top_fields_text`` lists the three highest-count fields and
        ``oldest_paper_text`` lists the three smallest (year, title) entries.
    """
    # Both figures are built from the unrounded inputs.
    field_figure = generate_cfdi_plot(cfdi, compute_type)
    age_figure = generate_maoc_plot(maoc, compute_type)

    # Three highest-count fields, one "field: count" entry per line.
    ranked_fields = sorted(
        field_counts.items(), key=lambda pair: pair[1], reverse=True
    )
    top_fields_text = "\n".join(
        f"{name}: {total}" for name, total in ranked_fields[:3]
    )

    # Three earliest entries, each on its own newline-terminated line.
    earliest_entries = sorted(year_title_dict.items())[:3]
    oldest_paper_text = "".join(
        f"[{year!s}] {paper_title}\n" for year, paper_title in earliest_entries
    )

    rounded_cfdi = round(cfdi, 3)
    rounded_cadi = round(cadi, 3)

    return (
        title_authors,
        num_references,
        top_fields_text,
        oldest_paper_text,
        rounded_cfdi,
        rounded_cadi,
        field_figure,
        age_figure,
    )


# Build the demo UI: two columns of explanatory text, three input tabs,
# the result widgets, and the event wiring that connects them.
with gr.Blocks(
    theme=gr.themes.Soft()
) as demo:
    with gr.Row():
        gr.Markdown(
            """
            # Citation Age and Field Diversity Calculator

            <div align="center">
                <img src="https://onedrive.live.com/embed?resid=684CB5200DB6B388%21682618&authkey=%21AILbTZikzXAbAyc&width=1310&height=728" />
            </div>

            Welcome to this interactive demo to analyze various aspects of your citational diversity. This tool will enable you to reflect on two critical aspects:

            - By whom am I influenced? Which fields heavily inform and shape the research trajectory of my works?
            - How far back in time do I cite? What are critical works (present and past) that shape my research?

            In addition, you will be able to analyze how the above compares to the average paper or author. The results you will receive can not be categorized into “good” or “bad”. Instead, they are meant to raise self-awareness about one’s citational diversity and reflect on it. The results might bring you to further questions, such as:

            - Am I reading widely across fields and time?
            - Should I expand my literature search to include works from other fields?
            - Are there ideas rooted in the past that can be used in an innovative way?

            Using citations as a tangible marker of influence, our demo provides empirical insights into the influence of papers across fields and time.

            ## What is Citation Field Diversity?

            Field diversity is a measure of the variety of research fields that a paper or an author draws upon. A high field diversity indicates that the work draws from various distinct research fields, demonstrating a multidisciplinary influence on that work or author.

            ## What is Citation Age Diversity?

            Citation age is a measure of how far back in time a paper cites other papers. A high citation age shows that the work draws from past works, while a low citation age indicates that mostly recent work has influenced that paper.

            """
        )
        gr.Markdown(
            """
            ## What are the Citation Field Diversity Index (CFDI) and Citation Age Diversity Index (CADI) and how are they calculated?

            The calculation of Field Diversity involves extracting all the references of a paper, categorizing them into distinct study fields, and determining the proportion of each study field over all the references. The Citation Field Diversity Index (CFDI) is then computed by applying the Gini Index on these proportions.
            Calculating CADI is similar to CFDI but instead of determining the proportion of each study field, we determine the proportion of citation ages. If we take a paper from 2020 that cites two papers, one from 2010 and one from 1990, the citation ages are 10 and 30, respectively. The CADI is then computed by applying the Gini Index on these ages.
            For more details, please refer to Eq. 3 in [this paper](https://aclanthology.org/2023.acl-long.341/) and Eq. 4 in [this paper](https://arxiv.org/).
            
            ## How do I Interpret CFDI and CADI?

            For both indices, higher values indicate a greater diversity of an NLP paper (in terms of how far back it cites and in the fields it cites). On the other hand, lower values signify a lower diversity, indicating that citations are more concentrated in specific fields and time ranges. 
            
            ## How can I use this demo?

            There are three ways to compute the field and age diversity for papers:
            1. **Semantic Scholar ID**: Enter the Semantic Scholar ID of a **paper** or **author** and click the *"Compute"* button.
            2. **ACL Anthology Link**: Paste the ACL Anthology link of a **paper**, **venue**, or **author** and click the *"Compute"* button.
            3. **PDF File**: Upload your **paper** PDF and click the *"Compute"* button.
            
            To retrieve the **Semantic Scholar ID** for a paper such as "The Elephant in the Room: Analyzing the Presence of Big Tech in Natural Language Processing Research," search the paper on Semantic Scholar [here](https://www.semanticscholar.org/paper/The-Elephant-in-the-Room%3A-Analyzing-the-Presence-of-Abdalla-Wahle/587ffdfd7229e8e0dbc5250b44df5fad6251f6ad) and use the last part of the URL. The Semantic Scholar ID (SSID) for this paper is: **587ffdfd7229e8e0dbc5250b44df5fad6251f6ad**.        

            To get an ACL Anthology link, you can go to any ACL Anthology paper, author or proceedings page and just copy and paste the url. For example:
            - https://aclanthology.org/2023.acl-long.1/
            - https://aclanthology.org/people/a/anna-rogers/
            - https://aclanthology.org/events/acl-2002/
            """
        )

    # One tab per supported input; each tab has its own "Compute" button.
    with gr.Row():
        with gr.Tabs():
            with gr.TabItem("Semantic Scholar ID"):
                s2_id = gr.Textbox(
                    label="Semantic Scholar ID",
                    placeholder=(
                        "Enter the Semantic Scholar ID here and press enter..."
                    ),
                    # Example ID for manual testing:
                    # value="587ffdfd7229e8e0dbc5250b44df5fad6251f6ad",
                )
                with gr.Row():
                    s2_submit_btn = gr.Button("Compute")
            with gr.TabItem("ACL Anthology Link"):
                acl_link = gr.Textbox(
                    label="ACL Anthology Link",
                    placeholder="Paste the ACL Anthology link here...",
                )
                with gr.Row():
                    acl_submit_btn = gr.Button("Compute")
            with gr.TabItem("PDF File"):
                pdf_file = gr.File(
                    file_types=[".pdf"], label="Upload your paper PDF"
                )
                with gr.Row():
                    file_submit_btn = gr.Button("Compute")

    # Result widgets.
    with gr.Row():
        title = gr.Textbox(
            label="Title / Author Name / Venue Name:", lines=2
        )  # Can be either paper title, author name, or proceedings title
    with gr.Row():
        num_ref = gr.Textbox(label="Number of references", lines=3)
        top_field_list = gr.Textbox(label="Top 3 fields cited:", lines=3)
        top_age_list = gr.Textbox(label="Top 3 oldest papers cited:", lines=3)
    with gr.Row():
        cfdi = gr.Textbox(label="CFDI")
        cadi = gr.Textbox(label="CADI")
    with gr.Row():
        cfdi_plot = gr.Plot(label="Citation Field Diversity")
        cadi_plot = gr.Plot(label="Citation Age Diversity")
    with gr.Row():
        clear_btn = gr.Button("Clear")

    # The eight result components, in the order the compute callbacks return
    # their values. Defined once so the compute and clear wiring cannot drift
    # apart.
    result_outputs = [
        title,
        num_ref,
        top_field_list,
        top_age_list,
        cfdi,
        cadi,
        cfdi_plot,
        cadi_plot,
    ]

    # Every compute callback receives all three inputs and picks the one that
    # matches its submit_type.
    submit_args = dict(
        inputs=[s2_id, pdf_file, acl_link],
        outputs=result_outputs,
    )

    s2_submit_args = {
        **submit_args,
        "fn": create_compute_stats(submit_type="s2_id"),
    }
    acl_submit_args = {
        **submit_args,
        "fn": create_compute_stats(submit_type="acl_link"),
    }
    file_submit_args = {
        **submit_args,
        "fn": create_compute_stats(submit_type="pdf_file"),
    }

    # Pressing enter in a text box triggers the same computation as its button.
    s2_id.submit(**s2_submit_args)
    acl_link.submit(**acl_submit_args)

    acl_submit_btn.click(**acl_submit_args)
    s2_submit_btn.click(**s2_submit_args)
    file_submit_btn.click(**file_submit_args)

    # return_clear yields eleven values: the eight results plus the three inputs.
    clear_btn.click(
        fn=return_clear,
        inputs=[],
        outputs=result_outputs + [s2_id, acl_link, pdf_file],
    )

# Enable request queuing, then serve the app on port 7860 with
# server_name "0.0.0.0".
# NOTE(review): `concurrency_count` is not accepted by `queue()` in newer
# Gradio major versions — confirm against the pinned Gradio version.
demo.queue(concurrency_count=3)
demo.launch(server_name="0.0.0.0", server_port=7860)