File size: 11,290 Bytes
505fd08
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19b7e49
 
505fd08
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
# Copyright 2023 by Jan Philip Wahle, https://jpwahle.com/
# All rights reserved.
# Thanks to Mukund Rungta for inspiration on early versions of this demo https://huggingface.co/spaces/mrungta8/CitationalAmnesia


import asyncio

import gradio as gr

from aclanthology import determine_page_type
from plots import generate_cfdi_plot, generate_maoc_plot
from s2 import (check_s2_id_type, compute_stats_for_acl_author,
                compute_stats_for_acl_paper, compute_stats_for_acl_venue,
                compute_stats_for_pdf, compute_stats_for_s2_author,
                compute_stats_for_s2_paper)


def return_clear():
    """Produce the reset values used by the "Clear" button.

    Returns:
        tuple: Eight ``None`` values, one for every component listed in the
        clear button's ``outputs`` (five result widgets and three inputs).
    """
    return (None,) * 8


def create_compute_stats(submit_type=None):
    """Create the event handler for one submission method of the demo.

    The returned closure only acts on the input that matches ``submit_type``
    and ignores the other two inputs it receives.

    Args:
        submit_type (str | None): Which input the handler serves: ``"s2_id"``,
            ``"acl_link"`` or ``"pdf_file"``. With any other value the
            handler always returns the empty result.

    Returns:
        callable: ``compute_stats(s2_id=None, pdf_file=None, acl_link=None)``.
        It returns the 5-tuple produced by ``plot_and_return_stats`` or, when
        nothing could be computed, a tuple of five ``None`` values.
    """
    # One placeholder per value returned by ``plot_and_return_stats`` so that
    # the "nothing to compute" result has the same arity as a real result.
    # The fallback used to return eight values, which did not match the five
    # output components the submit handlers are wired to.
    empty_result = (None,) * 5

    def compute_stats(s2_id=None, pdf_file=None, acl_link=None):
        """Compute and plot the statistics for the input selected by ``submit_type``."""
        if submit_type == "s2_id" and s2_id:
            # Check if s2_id is a paper id or an author id
            id_type, author_name = check_s2_id_type(s2_id)
            if id_type == "paper":
                return plot_and_return_stats(
                    *compute_stats_for_s2_paper(s2_id), "paper"
                )
            if id_type == "author":
                return plot_and_return_stats(
                    *compute_stats_for_s2_author(s2_id, author_name), "author"
                )
        if submit_type == "acl_link" and acl_link:
            # Crawl all papers for the author or venue or just the paper if it is a paper link
            url_type = determine_page_type(acl_link)
            if url_type == "paper":
                return plot_and_return_stats(
                    *compute_stats_for_acl_paper(acl_link), "paper"
                )
            if url_type == "author":
                return plot_and_return_stats(
                    *compute_stats_for_acl_author(acl_link), "author"
                )
            if url_type == "venue":
                return plot_and_return_stats(
                    *compute_stats_for_acl_venue(acl_link), "proceedings"
                )
        if submit_type == "pdf_file" and pdf_file:
            # compute_stats_for_pdf is a coroutine function; run it to
            # completion from this synchronous handler.
            pdf_results = asyncio.run(compute_stats_for_pdf(pdf_file))
            return plot_and_return_stats(*pdf_results, "paper")
        # Missing input, unknown submit type, or unrecognised id/link type.
        return empty_result

    return compute_stats


def plot_and_return_stats(
    title_authors,
    num_references,
    field_counts,
    year_title_dict,
    cfdi,
    cadi,
    maoc,
    compute_type,
):
    """
    Assemble the values displayed in the demo's result widgets.

    Args:
        title_authors: Passed through unchanged as the first result.
        num_references: Passed through unchanged as the second result.
        field_counts (dict): Maps each field to its count; only the three
            entries with the highest counts are reported.
        year_title_dict: Accepted but currently unused (the "oldest papers"
            output is disabled).
        cfdi: Numeric index value; handed to the plot as given and returned
            rounded to three decimals.
        cadi: Accepted but currently unused.
        maoc: Accepted but currently unused (the corresponding plot is
            disabled).
        compute_type: Forwarded to ``generate_cfdi_plot`` together with
            ``cfdi``.

    Returns:
        tuple: ``(title_authors, num_references, top_fields_text, cfdi,
        plot_cfdi)`` where ``top_fields_text`` holds one ``"field: count"``
        line per top field and ``cfdi`` is the rounded value.
    """
    # The plot is built from the unrounded index.
    cfdi_figure = generate_cfdi_plot(cfdi, compute_type)

    # Rank the fields by count, highest first, and keep the first three.
    ranked_fields = sorted(
        field_counts.items(), key=lambda entry: entry[1], reverse=True
    )
    top_three_lines = [
        f"{name}: {total}" for name, total in ranked_fields[:3]
    ]
    top_fields_summary = "\n".join(top_three_lines)

    rounded_cfdi = round(cfdi, 3)

    return (
        title_authors,
        num_references,
        top_fields_summary,
        rounded_cfdi,
        cfdi_figure,
    )


# Declare the demo UI. Components created inside the nested context managers
# below are laid out in the order in which they are instantiated.
with gr.Blocks(
    theme=gr.themes.Soft()
) as demo:
    # Introductory row: two Markdown panels (what CFDI is / how to interpret
    # it and how to use the demo).
    with gr.Row():
        gr.Markdown(
            """
            # Citation Field Diversity Calculator

            Welcome to this interactive demo to analyze the field diversity aspect of your citational practice. This tool will enable you to reflect on a critical aspect:

            - By whom am I influenced? Which fields heavily inform and shape the research trajectory of my works?

            In addition, you will be able to analyze how the above compares to the average paper or author. The results you will receive cannot be categorized into “good” or “bad”. Instead, they are meant to raise self-awareness about one’s citational diversity and reflect on it. The results might bring you to further questions, such as:

            - Am I reading widely across fields?
            - Should I expand my literature search to include works from other fields?

            Using citations as a tangible marker of influence, our demo provides empirical insights into the influence of papers across fields.

            ## What is Citation Field Diversity?

            Field diversity is a measure of the variety of research fields that a paper or an author draws upon. A high field diversity indicates that the work draws from various distinct research fields, demonstrating a multidisciplinary influence on that work or author.
            
            ## What is the Citation Field Diversity Index (CFDI) and how is it calculated?

            The calculation of Field Diversity involves extracting all the references of a paper, categorizing them into distinct study fields, and determining the proportion of each study field over all the references. The Citation Field Diversity Index (CFDI) is then computed by applying the Gini Index on these proportions.

            For more details, please refer to Eq. 3 in [this paper](https://aclanthology.org/2023.acl-long.341/).
            """
        )

        gr.Markdown(
            """
            ## How do I Interpret CFDI?

            Higher values of CFDI indicate a greater diversity of a paper in terms of the fields it cites, signifying a multidisciplinary influence. On the other hand, lower values signify a lower diversity, indicating that citations are more concentrated in specific fields.

            ## How can I use this demo?

            There are three ways for you to compute the field diversity for papers:
            1. **Semantic Scholar ID**: Enter the Semantic Scholar ID of a **paper** or **author** and click the *"Compute"* button.
            2. **ACL Anthology Link**: Paste the ACL Anthology link of a **paper**, **venue**, or **author** and click the *"Compute"* button.
            3. **PDF File**: Upload your **paper** PDF and click the *"Compute"* button.

            To retrieve the **Semantic Scholar ID** for a paper such as "The Elephant in the Room: Analyzing the Presence of Big Tech in Natural Language Processing Research," search the paper on Semantic Scholar [here](https://www.semanticscholar.org/paper/The-Elephant-in-the-Room%3A-Analyzing-the-Presence-of-Abdalla-Wahle/587ffdfd7229e8e0dbc5250b44df5fad6251f6ad) and use the last part of the URL. The Semantic Scholar ID (SSID) for this paper is: **587ffdfd7229e8e0dbc5250b44df5fad6251f6ad**.

            To get an ACL Anthology link, you can go to any ACL Anthology paper, author or proceedings page and just copy and paste the url. For example:
            - https://aclanthology.org/2023.acl-long.1/
            - https://aclanthology.org/people/a/anna-rogers/
            - https://aclanthology.org/events/acl-2002/
            """
        )

    # Input row: one tab per submission method, each with its own input
    # component and its own "Compute" button.
    with gr.Row():
        with gr.Tabs():
            with gr.TabItem("Semantic Scholar ID"):
                s2_id = gr.Textbox(
                    label="Semantic Scholar ID",
                    placeholder=(
                        "Enter the Semantic Scholar ID here and press enter..."
                    ),
                    # value="587ffdfd7229e8e0dbc5250b44df5fad6251f6ad",
                )
                with gr.Row():
                    s2_submit_btn = gr.Button("Compute")
            with gr.TabItem("ACL Anthology Link"):
                acl_link = gr.Textbox(
                    label="ACL Anthology Link",
                    placeholder="Paste the ACL Anthology link here...",
                )
                with gr.Row():
                    acl_submit_btn = gr.Button("Compute")
            with gr.TabItem("PDF File"):
                pdf_file = gr.File(
                    file_types=[".pdf"], label="Upload your paper PDF"
                )
                with gr.Row():
                    file_submit_btn = gr.Button("Compute")
    # Result widgets. The commented-out components belong to outputs that are
    # currently disabled.
    with gr.Row():
        title = gr.Textbox(
            label="Title / Author Name / Venue Name:", lines=2
        )  # Can be either paper title, author name, or proceedings title
    with gr.Row():
        num_ref = gr.Textbox(label="Number of references", lines=3)
        top_field_list = gr.Textbox(label="Top 3 fields cited:", lines=3)
        # top_age_list = gr.Textbox(label="Top 3 oldest papers cited:", lines=3)
    with gr.Row():
        cfdi = gr.Textbox(label="CFDI")
        # cadi = gr.Textbox(label="CADI")
    with gr.Row():
        cfdi_plot = gr.Plot(label="Citation Field Diversity")
        # cadi_plot = gr.Plot(label="Citation Age Diversity")
    with gr.Row():
        clear_btn = gr.Button("Clear")

    # Keyword arguments shared by every submit handler: each handler receives
    # all three inputs (in this order) and fills the same five outputs.
    submit_args = dict(
        inputs=[s2_id, pdf_file, acl_link],
        outputs=[
            title,
            num_ref,
            top_field_list,
            # top_age_list,
            cfdi,
            # cadi,
            cfdi_plot,
            # cadi_plot,
        ],
    )

    # One copy of the shared arguments per submission method; only the
    # callback differs, and create_compute_stats fixes which input it acts on.
    s2_submit_args = submit_args.copy()
    s2_submit_args["fn"] = create_compute_stats(submit_type="s2_id")

    acl_submit_args = submit_args.copy()
    acl_submit_args["fn"] = create_compute_stats(submit_type="acl_link")

    file_submit_args = submit_args.copy()
    file_submit_args["fn"] = create_compute_stats(submit_type="pdf_file")

    # Textbox submit events (presumably fired when Enter is pressed, as the
    # placeholder text suggests — confirm against the Gradio docs).
    s2_id.submit(**s2_submit_args)
    acl_link.submit(**acl_submit_args)

    # "Compute" buttons of the three tabs.
    acl_submit_btn.click(**acl_submit_args)
    s2_submit_btn.click(**s2_submit_args)
    file_submit_btn.click(**file_submit_args)

    # The clear handler targets the five result widgets plus the three inputs,
    # so return_clear has to supply eight values.
    clear_btn.click(
        fn=return_clear,
        inputs=[],
        outputs=[
            title,
            num_ref,
            top_field_list,
            # top_age_list,
            cfdi,
            # cadi,
            cfdi_plot,
            # cadi_plot,
            s2_id,
            acl_link,
            pdf_file,
        ],
    )

# Enable the request queue and start the server on all interfaces, port 7860.
# NOTE(review): `concurrency_count` depends on the installed Gradio version —
# confirm it is still accepted by the version this app is pinned to.
demo.queue(concurrency_count=3)
demo.launch(server_port=7860, server_name="0.0.0.0")