File size: 12,542 Bytes
0fba077
 
 
 
 
 
 
 
 
 
 
44a6195
 
 
 
0fba077
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7574c0c
0fba077
 
 
7574c0c
0fba077
 
 
 
 
 
7574c0c
0fba077
 
 
7574c0c
0fba077
 
 
7574c0c
0fba077
7574c0c
 
 
 
 
0fba077
 
 
 
 
 
 
 
 
 
 
 
 
7574c0c
0fba077
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7574c0c
0fba077
 
7574c0c
0fba077
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2becae6
 
 
 
0fba077
 
 
 
 
 
 
 
 
 
 
 
 
68c8475
0fba077
 
 
 
 
 
 
44a6195
0fba077
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e73438b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
# Copyright 2023 by Jan Philip Wahle, https://jpwahle.com/
# All rights reserved.
# Thanks to Mukund Rungta for inspiration on early versions of this demo https://huggingface.co/spaces/mrungta8/CitationalAmnesia


import asyncio

import gradio as gr

from aclanthology import determine_page_type
from plots import generate_cfdi_plot, generate_maoc_plot
from s2 import (check_s2_id_type, compute_stats_for_acl_author,
                compute_stats_for_acl_paper, compute_stats_for_acl_venue,
                compute_stats_for_pdf, compute_stats_for_s2_author,
                compute_stats_for_s2_paper)


def return_clear():
    """Produce the blank values used to reset the demo.

    Returns:
        tuple: Eleven ``None`` entries, one per component wired to the
        "Clear" button (eight result widgets followed by the three
        input widgets).
    """
    component_count = 11
    return (None,) * component_count


def create_compute_stats(submit_type=None):
    """Build the event handler that serves one of the demo's input tabs.

    Args:
        submit_type (str, optional): Which input the handler reacts to:
            ``"s2_id"``, ``"acl_link"`` or ``"pdf_file"``. With any other
            value the handler always returns the empty result.

    Returns:
        callable: ``compute_stats(s2_id=None, pdf_file=None, acl_link=None)``.
        It returns the output of ``plot_and_return_stats`` for the selected
        input, or a tuple of eight ``None`` values when that input is empty
        or its detected type matches none of the handled cases.
    """
    no_result = (None,) * 8

    def _finish(stats, label):
        # The label ("paper", "author" or "proceedings") becomes the final
        # positional argument of plot_and_return_stats.
        return plot_and_return_stats(*(stats + (label,)))

    def compute_stats(s2_id=None, pdf_file=None, acl_link=None):
        if submit_type == "s2_id" and s2_id:
            # The same text box takes paper ids and author ids, so the kind
            # of id has to be resolved first.
            id_type, author_name = check_s2_id_type(s2_id)
            if id_type == "paper":
                return _finish(compute_stats_for_s2_paper(s2_id), "paper")
            if id_type == "author":
                return _finish(
                    compute_stats_for_s2_author(s2_id, author_name), "author"
                )

        if submit_type == "acl_link" and acl_link:
            # An ACL Anthology URL may be a paper, an author or a venue page.
            url_type = determine_page_type(acl_link)
            if url_type == "paper":
                return _finish(compute_stats_for_acl_paper(acl_link), "paper")
            if url_type == "author":
                return _finish(
                    compute_stats_for_acl_author(acl_link), "author"
                )
            if url_type == "venue":
                return _finish(
                    compute_stats_for_acl_venue(acl_link), "proceedings"
                )

        if submit_type == "pdf_file" and pdf_file:
            # compute_stats_for_pdf yields an awaitable, so it is driven to
            # completion with asyncio.run.
            pdf_stats = asyncio.run(compute_stats_for_pdf(pdf_file))
            return _finish(pdf_stats, "paper")

        # Nothing usable was submitted, or the detected type is not handled.
        return no_result

    return compute_stats


def plot_and_return_stats(
    title_authors,
    num_references,
    field_counts,
    year_title_dict,
    cfdi,
    cadi,
    maoc,
    compute_type,
):
    """Create both plots and assemble the values shown in the result widgets.

    Args:
        title_authors: Passed through unchanged as the first output.
        num_references: Passed through unchanged as the second output.
        field_counts (dict): Maps a field to its count; the three entries
            with the largest counts are listed.
        year_title_dict (dict): Maps a year to a title; the first three
            entries in ascending sorted order are listed.
        cfdi: Value handed to ``generate_cfdi_plot`` and returned rounded
            to three decimals.
        cadi: Value returned rounded to three decimals.
        maoc: Value handed to ``generate_maoc_plot``.
        compute_type: Label forwarded to both plot helpers.

    Returns:
        tuple: ``(title_authors, num_references, top_fields_text,
        oldest_paper_text, rounded cfdi, rounded cadi, cfdi plot,
        maoc plot)``.
    """
    # The plots are built from the unrounded values.
    cfdi_figure = generate_cfdi_plot(cfdi, compute_type)
    maoc_figure = generate_maoc_plot(maoc, compute_type)

    # Rank fields by count, largest first, and keep the leading three.
    ranked_fields = sorted(
        field_counts.items(), key=lambda item: item[1], reverse=True
    )
    top_fields_text = "\n".join(
        f"{field_name}: {field_count}"
        for field_name, field_count in ranked_fields[:3]
    )

    # Ascending (year, title) order puts the oldest cited papers first.
    earliest_entries = sorted(year_title_dict.items())[:3]
    oldest_paper_text = "".join(
        f"[{str(year)}] {title}\n" for year, title in earliest_entries
    )

    return (
        title_authors,
        num_references,
        top_fields_text,
        oldest_paper_text,
        round(cfdi, 3),
        round(cadi, 3),
        cfdi_figure,
        maoc_figure,
    )


# Layout and event wiring of the demo. Everything created inside the
# ``with`` block becomes part of ``demo``.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Introductory text: two Markdown columns placed side by side.
    with gr.Row():
        gr.Markdown(
            """
            # Citation Age and Field Diversity Calculator

            <div align="center">
                <img src="https://onedrive.live.com/embed?resid=684CB5200DB6B388%21682618&authkey=%21AILbTZikzXAbAyc&width=1310&height=728" />
            </div>

            Welcome to this interactive demo to analyze various aspects of your citational diversity. This tool will enable you to reflect on two critical aspects:

            - By whom am I influenced? Which fields heavily inform and shape the research trajectory of my works?
            - How far back in time do I cite? What are critical works (present and past) that shape my research?

            In addition, you will be able to analyze how the above compares to the average paper or author. The results you will receive can not be categorized into “good” or “bad”. Instead, they are meant to raise self-awareness about one’s citational diversity and reflect on it. The results might bring you to further questions, such as:

            - Am I reading widely across fields and time?
            - Should I expand my literature search to include works from other fields?
            - Are there ideas rooted in the past that can be used in an innovative way?

            Using citations as a tangible marker of influence, our demo provides empirical insights into the influence of papers across fields and time.

            ## What is Citation Field Diversity?

            Field diversity is a measure of the variety of research Fields that a paper or an author draws upon. A high field diversity indicates that the work draws from various distinct research fields, demonstrating a multidisciplinary influence on that work or author.

            ## What is Citation Age Diversity?

            Citation age is a measure of how far back in time a paper cites other papers. A high citation age shows that the work draws from past works, while a low citation age indicates that mostly recent work has influenced that paper.

            """
        )
        # NOTE(review): the "Eq. 4" link in the text below points at the
        # arXiv front page, which looks like a placeholder; confirm the
        # intended paper URL.
        gr.Markdown(
            """
            ## What are the Citation Field Diversity Index (CFDI) and Citation Age Diversity Index (CADI) and how are they calculated?

            The calculation of Field Diversity involves extracting all the references of a paper, categorizing them into distinct study fields, and determining the proportion of each study field over all the references. The Citation Field Diversity Index (CFDI) is then computed by applying the Gini Index on these proportions.
            Calculating CADI is similar to CFDI but instead of determining the proportion of each study field, we determine the proportion of citation ages. If we take a paper from 2020 that cites two papers, one from 2010 and one from 1990, the citation ages are 10 and 30, respectively. The CADI is then computed by applying the Gini Index on these ages.
            For more details, please refer to Eq. 3 in [this paper](https://aclanthology.org/2023.acl-long.341/) and Eq. 4 in [this paper](https://arxiv.org/).
            
            ## How do I Interpret CFDI and CADI?

            For both indices, higher values indicate a greater diversity of an NLP paper (in terms of how far back it cites and in the fields it cites). On the other hand, lower values signify a lower diversity, indicating that citations are more concentrated in specific fields and time ranges. 
            
            ## How can I use this demo?

            There are three ways to compute the field and age diversity for papers:
            1. **Semantic Scholar ID**: Enter the Semantic Scholar ID of a **paper** or **author** and click the *"Compute"* button.
            2. **ACL Anthology Link**: Paste the ACL Anthology link of a **paper**, **venue**, or **author** and click the *"Compute"* button.
            3. **PDF File**: Upload your **paper** PDF and click the *"Compute"* button.
            
            To retrieve the **Semantic Scholar ID** for a paper such as "The Elephant in the Room: Analyzing the Presence of Big Tech in Natural Language Processing Research," search the paper on Semantic Scholar [here](https://www.semanticscholar.org/paper/The-Elephant-in-the-Room%3A-Analyzing-the-Presence-of-Abdalla-Wahle/587ffdfd7229e8e0dbc5250b44df5fad6251f6ad) and use the last part of the URL. The Semantic Scholar ID (SSID) for this paper is: **587ffdfd7229e8e0dbc5250b44df5fad6251f6ad**.        

            To get an ACL Anthology link, you can go to any ACL Anthology paper, author or proceedings page and just copy and paste the url. For example:
            - https://aclanthology.org/2023.acl-long.1/
            - https://aclanthology.org/people/a/anna-rogers/
            - https://aclanthology.org/events/acl-2002/
            """
        )

    # Input area: one tab per supported input, each with its own button.
    with gr.Row():
        with gr.Tabs():
            with gr.TabItem("Semantic Scholar ID"):
                s2_id = gr.Textbox(
                    label="Semantic Scholar ID",
                    placeholder=(
                        "Enter the Semantic Scholar ID here and press enter..."
                    ),
                    # value="587ffdfd7229e8e0dbc5250b44df5fad6251f6ad",
                )
                with gr.Row():
                    s2_submit_btn = gr.Button("Compute")
            with gr.TabItem("ACL Anthology Link"):
                acl_link = gr.Textbox(
                    label="ACL Anthology Link",
                    placeholder="Paste the ACL Anthology link here...",
                )
                with gr.Row():
                    acl_submit_btn = gr.Button("Compute")
            with gr.TabItem("PDF File"):
                pdf_file = gr.File(
                    file_types=[".pdf"], label="Upload your paper PDF"
                )
                with gr.Row():
                    file_submit_btn = gr.Button("Compute")

    # Result area.
    with gr.Row():
        title = gr.Textbox(
            label="Title / Author Name / Venue Name:", lines=2
        )  # Can be either paper title, author name, or proceedings title
    with gr.Row():
        num_ref = gr.Textbox(label="Number of references", lines=3)
        top_field_list = gr.Textbox(label="Top 3 fields cited:", lines=3)
        top_age_list = gr.Textbox(label="Top 3 oldest papers cited:", lines=3)
    with gr.Row():
        cfdi = gr.Textbox(label="CFDI")
        cadi = gr.Textbox(label="CADI")
    with gr.Row():
        cfdi_plot = gr.Plot(label="Citation Field Diversity")
        # NOTE(review): this widget receives the last handler output, which
        # plot_and_return_stats fills with the generate_maoc_plot figure;
        # confirm the "Citation Age Diversity" label is the intended one.
        cadi_plot = gr.Plot(label="Citation Age Diversity")
    with gr.Row():
        clear_btn = gr.Button("Clear")

    # Result widgets in the exact order of the tuple returned by the compute
    # handlers. Defined once so the compute and clear wiring cannot drift.
    result_outputs = [
        title,
        num_ref,
        top_field_list,
        top_age_list,
        cfdi,
        cadi,
        cfdi_plot,
        cadi_plot,
    ]

    # Every compute handler receives all three inputs (in the positional
    # order of compute_stats) and only acts on the one it was built for.
    submit_args = {
        "inputs": [s2_id, pdf_file, acl_link],
        "outputs": result_outputs,
    }
    s2_submit_args = {
        **submit_args,
        "fn": create_compute_stats(submit_type="s2_id"),
    }
    acl_submit_args = {
        **submit_args,
        "fn": create_compute_stats(submit_type="acl_link"),
    }
    file_submit_args = {
        **submit_args,
        "fn": create_compute_stats(submit_type="pdf_file"),
    }

    # Pressing enter in a text box triggers the same handler as its button.
    s2_id.submit(**s2_submit_args)
    acl_link.submit(**acl_submit_args)

    acl_submit_btn.click(**acl_submit_args)
    s2_submit_btn.click(**s2_submit_args)
    file_submit_btn.click(**file_submit_args)

    # "Clear" blanks the eight result widgets and then the three inputs;
    # return_clear supplies one None per listed component.
    clear_btn.click(
        fn=return_clear,
        inputs=[],
        outputs=result_outputs + [s2_id, acl_link, pdf_file],
    )

# NOTE(review): ``concurrency_count`` appears to be the Gradio 3.x keyword
# for queue(); confirm against the pinned Gradio version before upgrading.
demo.queue(concurrency_count=3)
# Serve on all network interfaces at port 7860.
demo.launch(server_port=7860, server_name="0.0.0.0")